CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,203 results for author: <span class="mathjax">Li, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Li, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+R&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18425">arXiv:2411.18425</a> <span> [<a href="https://arxiv.org/pdf/2411.18425">pdf</a>, <a href="https://arxiv.org/format/2411.18425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Streamlining Prediction in Bayesian Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Klasson%2C+M">Marcus Klasson</a>, <a href="/search/cs?searchtype=author&query=Solin%2C+A">Arno Solin</a>, <a href="/search/cs?searchtype=author&query=Trapp%2C+M">Martin Trapp</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18425v1-abstract-short" style="display: inline;"> The rising interest in Bayesian deep learning (BDL) has led to a plethora of methods for estimating the posterior distribution. However, efficient computation of inferences, such as predictions, has been largely overlooked with Monte Carlo integration remaining the standard. In this work we examine streamlining prediction in BDL through a single forward pass without sampling. For this we use local… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18425v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18425v1-abstract-full" style="display: none;"> The rising interest in Bayesian deep learning (BDL) has led to a plethora of methods for estimating the posterior distribution. However, efficient computation of inferences, such as predictions, has been largely overlooked with Monte Carlo integration remaining the standard. In this work we examine streamlining prediction in BDL through a single forward pass without sampling. For this we use local linearisation on activation functions and local Gaussian approximations at linear layers. Thus allowing us to analytically compute an approximation to the posterior predictive distribution. We showcase our approach for both MLP and transformers, such as ViT and GPT-2, and assess its performance on regression and classification tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18425v1-abstract-full').style.display = 'none'; document.getElementById('2411.18425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17558">arXiv:2411.17558</a> <span> [<a href="https://arxiv.org/pdf/2411.17558">pdf</a>, <a href="https://arxiv.org/format/2411.17558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Natural Language Understanding and Inference with MLLM in Visual Question Answering: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kuang%2C+J">Jiayi Kuang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jingyou Xie</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haohao Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ronghao Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xianfeng Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinghui Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xika Lin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Ying Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17558v1-abstract-short" style="display: inline;"> Visual Question Answering (VQA) is a challenge task that combines natural language processing and computer vision techniques and gradually becomes a benchmark test task in multimodal large language models (MLLMs). The goal of our survey is to provide an overview of the development of VQA and a detailed description of the latest models with high timeliness. This survey gives an up-to-date synthesis… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17558v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17558v1-abstract-full" style="display: none;"> Visual Question Answering (VQA) is a challenge task that combines natural language processing and computer vision techniques and gradually becomes a benchmark test task in multimodal large language models (MLLMs). The goal of our survey is to provide an overview of the development of VQA and a detailed description of the latest models with high timeliness. This survey gives an up-to-date synthesis of natural language understanding of images and text, as well as the knowledge reasoning module based on image-question information on the core VQA tasks. In addition, we elaborate on recent advances in extracting and fusing modal information with vision-language pretraining models and multimodal large language models in VQA. We also exhaustively review the progress of knowledge reasoning in VQA by detailing the extraction of internal knowledge and the introduction of external knowledge. Finally, we present the datasets of VQA and different evaluation metrics and discuss possible directions for future work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17558v1-abstract-full').style.display = 'none'; document.getElementById('2411.17558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16991">arXiv:2411.16991</a> <span> [<a href="https://arxiv.org/pdf/2411.16991">pdf</a>, <a href="https://arxiv.org/format/2411.16991">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Self-Distillation via Previous Mini-batches for Fine-tuning Small Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yao Fu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yin Yu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaotian Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runchao Li</a>, <a href="/search/cs?searchtype=author&query=Long%2C+X">Xianxuan Long</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haotian Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16991v1-abstract-short" style="display: inline;"> Knowledge distillation (KD) has become a widely adopted approach for compressing large language models (LLMs) to reduce computational costs and memory footprints. However, the availability of complex teacher models is a prerequisite for running most KD pipelines. Thus, the traditional KD procedure can be unachievable or budget-unfriendly, particularly when relying on commercial LLMs like GPT4. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16991v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16991v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16991v1-abstract-full" style="display: none;"> Knowledge distillation (KD) has become a widely adopted approach for compressing large language models (LLMs) to reduce computational costs and memory footprints. However, the availability of complex teacher models is a prerequisite for running most KD pipelines. Thus, the traditional KD procedure can be unachievable or budget-unfriendly, particularly when relying on commercial LLMs like GPT4. In this regard, Self-distillation (SelfD) emerges as an advisable alternative, enabling student models to learn without teachers' guidance. Nonetheless, existing SelfD approaches for LMs often involve architectural modifications, assuming the models are open-source, which may not always be practical. In this work, we introduce a model-agnostic and task-agnostic method named dynamic SelfD from the previous minibatch (DynSDPB), which realizes current iterations' distillation from the last ones' generated logits. Additionally, to address prediction inaccuracies during the early iterations, we dynamically adjust the distillation influence and temperature values to enhance the adaptability of fine-tuning. Furthermore, DynSDPB is a novel fine-tuning policy that facilitates the seamless integration of existing self-correction and self-training techniques for small language models (SLMs) because they all require updating SLMs' parameters. We demonstrate the superior performance of DynSDPB on both encoder-only LMs (e.g., BERT model families) and decoder-only LMs (e.g., LLaMA model families), validating its effectiveness across natural language understanding (NLU) and natural language generation (NLG) benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16991v1-abstract-full').style.display = 'none'; document.getElementById('2411.16991v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16196">arXiv:2411.16196</a> <span> [<a href="https://arxiv.org/pdf/2411.16196">pdf</a>, <a href="https://arxiv.org/format/2411.16196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learn from Foundation Model: Fruit Detection Model without Manual Annotation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanan Wang</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zhenghao Fei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruichen Li</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+Y">Yibin Ying</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16196v1-abstract-short" style="display: inline;"> Recent breakthroughs in large foundation models have enabled the possibility of transferring knowledge pre-trained on vast datasets to domains with limited data availability. Agriculture is one of the domains that lacks sufficient data. This study proposes a framework to train effective, domain-specific, small models from foundation models without manual annotation. Our approach begins with SDM (S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16196v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16196v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16196v1-abstract-full" style="display: none;"> Recent breakthroughs in large foundation models have enabled the possibility of transferring knowledge pre-trained on vast datasets to domains with limited data availability. Agriculture is one of the domains that lacks sufficient data. This study proposes a framework to train effective, domain-specific, small models from foundation models without manual annotation. Our approach begins with SDM (Segmentation-Description-Matching), a stage that leverages two foundation models: SAM2 (Segment Anything in Images and Videos) for segmentation and OpenCLIP (Open Contrastive Language-Image Pretraining) for zero-shot open-vocabulary classification. In the second stage, a novel knowledge distillation mechanism is utilized to distill compact, edge-deployable models from SDM, enhancing both inference speed and perception accuracy. The complete method, termed SDM-D (Segmentation-Description-Matching-Distilling), demonstrates strong performance across various fruit detection tasks object detection, semantic segmentation, and instance segmentation) without manual annotation. It nearly matches the performance of models trained with abundant labels. Notably, SDM-D outperforms open-set detection methods such as Grounding SAM and YOLO-World on all tested fruit detection datasets. Additionally, we introduce MegaFruits, a comprehensive fruit segmentation dataset encompassing over 25,000 images, and all code and datasets are made publicly available at https://github.com/AgRoboticsResearch/SDM-D.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16196v1-abstract-full').style.display = 'none'; document.getElementById('2411.16196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 12 figures, conference or other essential info</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15847">arXiv:2411.15847</a> <span> [<a href="https://arxiv.org/pdf/2411.15847">pdf</a>, <a href="https://arxiv.org/format/2411.15847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedQP: Towards Accurate Federated Learning using Quadratic Programming Guided Mutation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weng%2C+J">Jiawen Weng</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Zeke Xia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ran Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Ming Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingsong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15847v1-abstract-short" style="display: inline;"> Due to the advantages of privacy-preserving, Federated Learning (FL) is widely used in distributed machine learning systems. However, existing FL methods suffer from low-inference performance caused by data heterogeneity. Specifically, due to heterogeneous data, the optimization directions of different local models vary greatly, making it difficult for the traditional FL method to get a generalize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15847v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15847v1-abstract-full" style="display: none;"> Due to the advantages of privacy-preserving, Federated Learning (FL) is widely used in distributed machine learning systems. However, existing FL methods suffer from low-inference performance caused by data heterogeneity. Specifically, due to heterogeneous data, the optimization directions of different local models vary greatly, making it difficult for the traditional FL method to get a generalized global model that performs well on all clients. As one of the state-of-the-art FL methods, the mutation-based FL method attempts to adopt a stochastic mutation strategy to guide the model training towards a well-generalized area (i.e., flat area in the loss landscape). Specifically, mutation allows the model to shift within the solution space, providing an opportunity to escape areas with poor generalization (i.e., sharp area). However, the stochastic mutation strategy easily results in diverse optimal directions of mutated models, which limits the performance of the existing mutation-based FL method. To achieve higher performance, this paper proposes a novel mutation-based FL approach named FedQP, utilizing a quadratic programming strategy to regulate the mutation directions wisely. By biasing the model mutation towards the direction of gradient update rather than traditional random mutation, FedQP can effectively guide the model to optimize towards a well-generalized area (i.e., flat area). Experiments on multiple well-known datasets show that our quadratic programming-guided mutation strategy effectively improves the inference accuracy of the global model in various heterogeneous data scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15847v1-abstract-full').style.display = 'none'; document.getElementById('2411.15847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SEKE 2024, 6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15551">arXiv:2411.15551</a> <span> [<a href="https://arxiv.org/pdf/2411.15551">pdf</a>, <a href="https://arxiv.org/format/2411.15551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NeRF Inpainting with Geometric Diffusion Prior and Balanced Score Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Menglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xin Luo</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+Y">Yunwei Lan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Ganlin Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15551v1-abstract-short" style="display: inline;"> Recent advances in NeRF inpainting have leveraged pretrained diffusion models to enhance performance. However, these methods often yield suboptimal results due to their ineffective utilization of 2D diffusion priors. The limitations manifest in two critical aspects: the inadequate capture of geometric information by pretrained diffusion models and the suboptimal guidance provided by existing Score… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15551v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15551v1-abstract-full" style="display: none;"> Recent advances in NeRF inpainting have leveraged pretrained diffusion models to enhance performance. However, these methods often yield suboptimal results due to their ineffective utilization of 2D diffusion priors. The limitations manifest in two critical aspects: the inadequate capture of geometric information by pretrained diffusion models and the suboptimal guidance provided by existing Score Distillation Sampling (SDS) methods. To address these problems, we introduce GB-NeRF, a novel framework that enhances NeRF inpainting through improved utilization of 2D diffusion priors. Our approach incorporates two key innovations: a fine-tuning strategy that simultaneously learns appearance and geometric priors and a specialized normal distillation loss that integrates these geometric priors into NeRF inpainting. We propose a technique called Balanced Score Distillation (BSD) that surpasses existing methods such as Score Distillation (SDS) and the improved version, Conditional Score Distillation (CSD). BSD offers improved inpainting quality in appearance and geometric aspects. Extensive experiments show that our method provides superior appearance fidelity and geometric consistency compared to existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15551v1-abstract-full').style.display = 'none'; document.getElementById('2411.15551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15468">arXiv:2411.15468</a> <span> [<a href="https://arxiv.org/pdf/2411.15468">pdf</a>, <a href="https://arxiv.org/format/2411.15468">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SplatSDF: Boosting Neural Implicit SDF via Gaussian Splatting Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R+B">Runfa Blark Li</a>, <a href="/search/cs?searchtype=author&query=Suzuki%2C+K">Keito Suzuki</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bang Du</a>, <a href="/search/cs?searchtype=author&query=Le%2C+K+M+B">Ki Myung Brian Le</a>, <a href="/search/cs?searchtype=author&query=Atanasov%2C+N">Nikolay Atanasov</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Truong Nguyen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15468v1-abstract-short" style="display: inline;"> A signed distance function (SDF) is a useful representation for continuous-space geometry and many related operations, including rendering, collision checking, and mesh generation. Hence, reconstructing SDF from image observations accurately and efficiently is a fundamental problem. Recently, neural implicit SDF (SDF-NeRF) techniques, trained using volumetric rendering, have gained a lot of attent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15468v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15468v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15468v1-abstract-full" style="display: none;"> A signed distance function (SDF) is a useful representation for continuous-space geometry and many related operations, including rendering, collision checking, and mesh generation. Hence, reconstructing SDF from image observations accurately and efficiently is a fundamental problem. Recently, neural implicit SDF (SDF-NeRF) techniques, trained using volumetric rendering, have gained a lot of attention. Compared to earlier truncated SDF (TSDF) fusion algorithms that rely on depth maps and voxelize continuous space, SDF-NeRF enables continuous-space SDF reconstruction with better geometric and photometric accuracy. However, the accuracy and convergence speed of scene-level SDF reconstruction require further improvements for many applications. With the advent of 3D Gaussian Splatting (3DGS) as an explicit representation with excellent rendering quality and speed, several works have focused on improving SDF-NeRF by introducing consistency losses on depth and surface normals between 3DGS and SDF-NeRF. However, loss-level connections alone lead to incremental improvements. We propose a novel neural implicit SDF called "SplatSDF" to fuse 3DGSandSDF-NeRF at an architecture level with significant boosts to geometric and photometric accuracy and convergence speed. Our SplatSDF relies on 3DGS as input only during training, and keeps the same complexity and efficiency as the original SDF-NeRF during inference. Our method outperforms state-of-the-art SDF-NeRF models on geometric and photometric evaluation by the time of submission. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15468v1-abstract-full').style.display = 'none'; document.getElementById('2411.15468v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15355">arXiv:2411.15355</a> <span> [<a href="https://arxiv.org/pdf/2411.15355">pdf</a>, <a href="https://arxiv.org/format/2411.15355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UniGaussian: Driving Scene Reconstruction from Multiple Camera Models via Unified Gaussian Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yuan Ren</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guile Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runhao Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zheyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yibo Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingxin Chen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+T">Tongtong Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15355v1-abstract-short" style="display: inline;"> Urban scene reconstruction is crucial for real-world autonomous driving simulators. Although existing methods have achieved photorealistic reconstruction, they mostly focus on pinhole cameras and neglect fisheye cameras. In fact, how to effectively simulate fisheye cameras in driving scene remains an unsolved problem. In this work, we propose UniGaussian, a novel approach that learns a unified 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15355v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15355v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15355v1-abstract-full" style="display: none;"> Urban scene reconstruction is crucial for real-world autonomous driving simulators. Although existing methods have achieved photorealistic reconstruction, they mostly focus on pinhole cameras and neglect fisheye cameras. In fact, how to effectively simulate fisheye cameras in driving scene remains an unsolved problem. In this work, we propose UniGaussian, a novel approach that learns a unified 3D Gaussian representation from multiple camera models for urban scene reconstruction in autonomous driving. Our contributions are two-fold. First, we propose a new differentiable rendering method that distorts 3D Gaussians using a series of affine transformations tailored to fisheye camera models. This addresses the compatibility issue of 3D Gaussian splatting with fisheye cameras, which is hindered by light ray distortion caused by lenses or mirrors. Besides, our method maintains real-time rendering while ensuring differentiability. Second, built on the differentiable rendering method, we design a new framework that learns a unified Gaussian representation from multiple camera models. By applying affine transformations to adapt different camera models and regularizing the shared Gaussians with supervision from different modalities, our framework learns a unified 3D Gaussian representation with input data from multiple sources and achieves holistic driving scene understanding. As a result, our approach models multiple sensors (pinhole and fisheye cameras) and modalities (depth, semantic, normal and LiDAR point clouds). Our experiments show that our method achieves superior rendering quality and fast rendering speed for driving scene simulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15355v1-abstract-full').style.display = 'none'; document.getElementById('2411.15355v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15231">arXiv:2411.15231</a> <span> [<a href="https://arxiv.org/pdf/2411.15231">pdf</a>, <a href="https://arxiv.org/format/2411.15231">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IterIS: Iterative Inference-Solving Alignment for LoRA Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongxu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runshi Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bowei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15231v1-abstract-short" style="display: inline;"> Low-rank adaptations (LoRA) are widely used to fine-tune large models across various domains for specific downstream tasks. While task-specific LoRAs are often available, concerns about data privacy and intellectual property can restrict access to training data, limiting the acquisition of a multi-task model through gradient-based training. In response, LoRA merging presents an effective solution… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15231v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15231v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15231v1-abstract-full" style="display: none;"> Low-rank adaptations (LoRA) are widely used to fine-tune large models across various domains for specific downstream tasks. While task-specific LoRAs are often available, concerns about data privacy and intellectual property can restrict access to training data, limiting the acquisition of a multi-task model through gradient-based training. In response, LoRA merging presents an effective solution by combining multiple LoRAs into a unified adapter while maintaining data privacy. Prior works on LoRA merging primarily frame it as an optimization problem, yet these approaches face several limitations, including the rough assumption about input features utilized in optimization, massive sample requirements, and the unbalanced optimization objective. These limitations can significantly degrade performance. To address these, we propose a novel optimization-based method, named IterIS: 1) We formulate LoRA merging as an advanced optimization problem to mitigate the rough assumption. Additionally, we employ an iterative inference-solving framework in our algorithm. It can progressively refine the optimization objective for improved performance. 2) We introduce an efficient regularization term to reduce the need for massive sample requirements (requiring only 1-5% of the unlabeled samples compared to prior methods). 3) We utilize adaptive weights in the optimization objective to mitigate potential unbalances in LoRA merging process. Our method demonstrates significant improvements over multiple baselines and state-of-the-art methods in composing tasks for text-to-image diffusion, vision-language models, and large language models. Furthermore, our layer-wise algorithm can achieve convergence with minimal steps, ensuring efficiency in both memory and computation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15231v1-abstract-full').style.display = 'none'; document.getElementById('2411.15231v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14174">arXiv:2411.14174</a> <span> [<a href="https://arxiv.org/pdf/2411.14174">pdf</a>, <a href="https://arxiv.org/format/2411.14174">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.14722/ndss.2025.241407">10.14722/ndss.2025.241407 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Translating C To Rust: Lessons from a User Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruishi Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianyu Li</a>, <a href="/search/cs?searchtype=author&query=Saxena%2C+P">Prateek Saxena</a>, <a href="/search/cs?searchtype=author&query=Kundu%2C+A">Ashish Kundu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14174v1-abstract-short" style="display: inline;"> Rust aims to offer full memory safety for programs, a guarantee that untamed C programs do not enjoy. How difficult is it to translate existing C code to Rust? To get a complementary view from that of automatic C to Rust translators, we report on a user study asking humans to translate real-world C programs to Rust. Our participants are able to produce safe Rust translations, whereas state-of-the-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14174v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14174v1-abstract-full" style="display: none;"> Rust aims to offer full memory safety for programs, a guarantee that untamed C programs do not enjoy. How difficult is it to translate existing C code to Rust? To get a complementary view from that of automatic C to Rust translators, we report on a user study asking humans to translate real-world C programs to Rust. Our participants are able to produce safe Rust translations, whereas state-of-the-art automatic tools are not able to do so. Our analysis highlights that the high-level strategy taken by users departs significantly from those of automatic tools we study. We also find that users often choose zero-cost (static) abstractions for temporal safety, which addresses a predominant component of runtime costs in other full memory safety defenses. User-provided translations showcase a rich landscape of specialized strategies to translate the same C program in different ways to safe Rust, which future automatic translators can consider. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14174v1-abstract-full').style.display = 'none'; document.getElementById('2411.14174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NDSS Symposium 2025. Please cite the conference version of this paper, e.g., "Ruishi Li, Bo Wang, Tianyu Li, Prateek Saxena, Ashish Kundu. Translating C To Rust: Lessons from a User Study. In 32nd Annual Network and Distributed System Security Symposium (NDSS 2025)."</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13004">arXiv:2411.13004</a> <span> [<a href="https://arxiv.org/pdf/2411.13004">pdf</a>, <a href="https://arxiv.org/format/2411.13004">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MERLOT: A Distilled LLM-based Mixture-of-Experts Framework for Scalable Encrypted Traffic Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rongpeng Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhifeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Honggang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13004v1-abstract-short" style="display: inline;"> We present MERLOT, a scalable mixture-of-expert (MoE) based refinement of distilled large language model optimized for encrypted traffic classification. By applying model distillation techniques in a teacher-student paradigm, compact models derived from GPT-2-base retain high classification accuracy while minimizing computational costs. These models function as specialized experts in an MoE archit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13004v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13004v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13004v1-abstract-full" style="display: none;"> We present MERLOT, a scalable mixture-of-expert (MoE) based refinement of distilled large language model optimized for encrypted traffic classification. By applying model distillation techniques in a teacher-student paradigm, compact models derived from GPT-2-base retain high classification accuracy while minimizing computational costs. These models function as specialized experts in an MoE architecture, dynamically assigned via a gating network. Unlike generation-based methods, our approach directly classifies encrypted traffic using the final decoder token with contextual feature embedding as input. Experiments on 10 datasets show superior or competitive performance over the state-of-the-art models while significantly reducing resource demands, underscoring its effectiveness and robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13004v1-abstract-full').style.display = 'none'; document.getElementById('2411.13004v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12814">arXiv:2411.12814</a> <span> [<a href="https://arxiv.org/pdf/2411.12814">pdf</a>, <a href="https://arxiv.org/format/2411.12814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junlong Cheng</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">He Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junren Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingwen Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12814v2-abstract-short" style="display: inline;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12814v2-abstract-full" style="display: none;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground truth masks from multiple data sources. Then, leveraging the strong object recognition capabilities of a vision foundational model, we automatically generated dense interactive masks for each image and ensured their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks-an average of 56 masks per image. Finally, we developed an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundational models in medical computer vision, we release the IMed-361M and model at https://github.com/uni-medical/IMIS-Bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v2-abstract-full').style.display = 'none'; document.getElementById('2411.12814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11045">arXiv:2411.11045</a> <span> [<a href="https://arxiv.org/pdf/2411.11045">pdf</a>, <a href="https://arxiv.org/format/2411.11045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> StableV2V: Stablizing Shape Consistency in Video-to-Video Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+Y">Yunwei Lan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11045v1-abstract-short" style="display: inline;"> Recent advancements of generative AI have significantly promoted content creation and editing, where prevailing studies further extend this exciting progress to video editing. In doing so, these studies mainly transfer the inherent motion patterns from the source videos to the edited ones, where results with inferior consistency to user prompts are often observed, due to the lack of particular ali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11045v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11045v1-abstract-full" style="display: none;"> Recent advancements of generative AI have significantly promoted content creation and editing, where prevailing studies further extend this exciting progress to video editing. In doing so, these studies mainly transfer the inherent motion patterns from the source videos to the edited ones, where results with inferior consistency to user prompts are often observed, due to the lack of particular alignments between the delivered motions and edited contents. To address this limitation, we present a shape-consistent video editing method, namely StableV2V, in this paper. Our method decomposes the entire editing pipeline into several sequential procedures, where it edits the first video frame, then establishes an alignment between the delivered motions and user prompts, and eventually propagates the edited contents to all other frames based on such alignment. Furthermore, we curate a testing benchmark, namely DAVIS-Edit, for a comprehensive evaluation of video editing, considering various types of prompts and difficulties. Experimental results and analyses illustrate the outperforming performance, visual consistency, and inference efficiency of our method compared to existing state-of-the-art studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11045v1-abstract-full').style.display = 'none'; document.getElementById('2411.11045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://alonzoleeeooo.github.io/StableV2V, code: https://github.com/AlonzoLeeeooo/StableV2V, model weights: https://huggingface.co/AlonzoLeeeooo/StableV2V, dataset (DAVIS-Edit): https://huggingface.co/datasets/AlonzoLeeeooo/DAVIS-Edit</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10258">arXiv:2411.10258</a> <span> [<a href="https://arxiv.org/pdf/2411.10258">pdf</a>, <a href="https://arxiv.org/format/2411.10258">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> MDHP-Net: Detecting Injection Attacks on In-vehicle Network using Multi-Dimensional Hawkes Process and Temporal Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanchen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruifeng Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chenhong Cao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yufeng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingyu Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+R">Runhan Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10258v1-abstract-short" style="display: inline;"> The integration of intelligent and connected technologies in modern vehicles, while offering enhanced functionalities through Electronic Control Unit and interfaces like OBD-II and telematics, also exposes the vehicle's in-vehicle network (IVN) to potential cyberattacks. In this paper, we consider a specific type of cyberattack known as the injection attack. As demonstrated by empirical data from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10258v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10258v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10258v1-abstract-full" style="display: none;"> The integration of intelligent and connected technologies in modern vehicles, while offering enhanced functionalities through Electronic Control Unit and interfaces like OBD-II and telematics, also exposes the vehicle's in-vehicle network (IVN) to potential cyberattacks. In this paper, we consider a specific type of cyberattack known as the injection attack. As demonstrated by empirical data from real-world cybersecurity adversarial competitions(available at https://mimic2024.xctf.org.cn/race/qwmimic2024 ), these injection attacks have excitation effect over time, gradually manipulating network traffic and disrupting the vehicle's normal functioning, ultimately compromising both its stability and safety. To profile the abnormal behavior of attackers, we propose a novel injection attack detector to extract long-term features of attack behavior. Specifically, we first provide a theoretical analysis of modeling the time-excitation effects of the attack using Multi-Dimensional Hawkes Process (MDHP). A gradient descent solver specifically tailored for MDHP, MDHP-GDS, is developed to accurately estimate optimal MDHP parameters. We then propose an injection attack detector, MDHP-Net, which integrates optimal MDHP parameters with MDHP-LSTM blocks to enhance temporal feature extraction. By introducing MDHP parameters, MDHP-Net captures complex temporal features that standard Long Short-Term Memory (LSTM) cannot, enriching temporal dependencies within our customized structure. Extensive evaluations demonstrate the effectiveness of our proposed detection approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10258v1-abstract-full').style.display = 'none'; document.getElementById('2411.10258v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09472">arXiv:2411.09472</a> <span> [<a href="https://arxiv.org/pdf/2411.09472">pdf</a>, <a href="https://arxiv.org/ps/2411.09472">ps</a>, <a href="https://arxiv.org/format/2411.09472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> An Algorithm for the Longest Common Subsequence and Substring Problem for Multiple Strings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Rao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09472v2-abstract-short" style="display: inline;"> Let $X_1, X_2, ..., X_s$ and $Y_1, Y_2, ..., Y_t$ be strings over an alphabet $危$, where $s$ and $t$ are positive integers. The longest common subsequence and substring problem for multiple strings $X_1, X_2, ..., X_s$ and $Y_1, Y_2, ..., Y_t$ is to find the longest string which is a subsequence of $X_1, X_2, ..., X_s$ and a substring of $Y_1, Y_2, ..., Y_t$. In this paper, we propose an algorithm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09472v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09472v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09472v2-abstract-full" style="display: none;"> Let $X_1, X_2, ..., X_s$ and $Y_1, Y_2, ..., Y_t$ be strings over an alphabet $危$, where $s$ and $t$ are positive integers. The longest common subsequence and substring problem for multiple strings $X_1, X_2, ..., X_s$ and $Y_1, Y_2, ..., Y_t$ is to find the longest string which is a subsequence of $X_1, X_2, ..., X_s$ and a substring of $Y_1, Y_2, ..., Y_t$. In this paper, we propose an algorithm to solve the problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09472v2-abstract-full').style.display = 'none'; document.getElementById('2411.09472v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09250">arXiv:2411.09250</a> <span> [<a href="https://arxiv.org/pdf/2411.09250">pdf</a>, <a href="https://arxiv.org/format/2411.09250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Embedding Space Allocation with Angle-Norm Joint Classifiers for Few-Shot Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tu%2C+D">Dunwei Tu</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+H">Huiyu Yi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tieyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruotong Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+F">Furao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jian Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09250v1-abstract-short" style="display: inline;"> Few-shot class-incremental learning (FSCIL) aims to continually learn new classes from only a few samples without forgetting previous ones, requiring intelligent agents to adapt to dynamic environments. FSCIL combines the characteristics and challenges of class-incremental learning and few-shot learning: (i) Current classes occupy the entire feature space, which is detrimental to learning new clas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09250v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09250v1-abstract-full" style="display: none;"> Few-shot class-incremental learning (FSCIL) aims to continually learn new classes from only a few samples without forgetting previous ones, requiring intelligent agents to adapt to dynamic environments. FSCIL combines the characteristics and challenges of class-incremental learning and few-shot learning: (i) Current classes occupy the entire feature space, which is detrimental to learning new classes. (ii) The small number of samples in incremental rounds is insufficient for fully training. In existing mainstream virtual class methods, for addressing the challenge (i), they attempt to use virtual classes as placeholders. However, new classes may not necessarily align with the virtual classes. For the challenge (ii), they replace trainable fully connected layers with Nearest Class Mean (NCM) classifiers based on cosine similarity, but NCM classifiers do not account for sample imbalance issues. To address these issues in previous methods, we propose the class-center guided embedding Space Allocation with Angle-Norm joint classifiers (SAAN) learning framework, which provides balanced space for all classes and leverages norm differences caused by sample imbalance to enhance classification criteria. Specifically, for challenge (i), SAAN divides the feature space into multiple subspaces and allocates a dedicated subspace for each session by guiding samples with the pre-set category centers. For challenge (ii), SAAN establishes a norm distribution for each class and generates angle-norm joint logits. Experiments demonstrate that SAAN can achieve state-of-the-art performance and it can be directly embedded into other SOTA methods as a plug-in, further enhancing their performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09250v1-abstract-full').style.display = 'none'; document.getElementById('2411.09250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08545">arXiv:2411.08545</a> <span> [<a href="https://arxiv.org/pdf/2411.08545">pdf</a>, <a href="https://arxiv.org/format/2411.08545">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> APDDv2: Aesthetics of Paintings and Drawings Dataset with Artist Labeled Scores and Comments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xin Jin</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Q">Qianqian Qiao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yi Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huaye Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Heng Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shan Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jianfei Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08545v1-abstract-short" style="display: inline;"> Datasets play a pivotal role in training visual models, facilitating the development of abstract understandings of visual features through diverse image samples and multidimensional attributes. However, in the realm of aesthetic evaluation of artistic images, datasets remain relatively scarce. Existing painting datasets are often characterized by limited scoring dimensions and insufficient annotat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08545v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08545v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08545v1-abstract-full" style="display: none;"> Datasets play a pivotal role in training visual models, facilitating the development of abstract understandings of visual features through diverse image samples and multidimensional attributes. However, in the realm of aesthetic evaluation of artistic images, datasets remain relatively scarce. Existing painting datasets are often characterized by limited scoring dimensions and insufficient annotations, thereby constraining the advancement and application of automatic aesthetic evaluation methods in the domain of painting. To bridge this gap, we introduce the Aesthetics Paintings and Drawings Dataset (APDD), the first comprehensive collection of paintings encompassing 24 distinct artistic categories and 10 aesthetic attributes. Building upon the initial release of APDDv1, our ongoing research has identified opportunities for enhancement in data scale and annotation precision. Consequently, APDDv2 boasts an expanded image corpus and improved annotation quality, featuring detailed language comments to better cater to the needs of both researchers and practitioners seeking high-quality painting datasets. Furthermore, we present an updated version of the Art Assessment Network for Specific Painting Styles, denoted as ArtCLIP. Experimental validation demonstrates the superior performance of this revised model in the realm of aesthetic evaluation, surpassing its predecessor in accuracy and efficacy. The dataset and model are available at https://github.com/BestiVictory/APDDv2.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08545v1-abstract-full').style.display = 'none'; document.getElementById('2411.08545v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08488">arXiv:2411.08488</a> <span> [<a href="https://arxiv.org/pdf/2411.08488">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UNSCT-HRNet: Modeling Anatomical Uncertainty for Landmark Detection in Total Hip Arthroplasty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wan%2C+J">Jiaxin Wan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoran Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liangwei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Kou%2C+S">Shuheng Kou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runtian Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiayi Tang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Juanxiu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaohui Du</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+R">Ruqian Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08488v1-abstract-short" style="display: inline;"> Total hip arthroplasty (THA) relies on accurate landmark detection from radiographic images, but unstructured data caused by irregular patient postures or occluded anatomical markers pose significant challenges for existing methods. To address this, we propose UNSCT-HRNet (Unstructured CT - High-Resolution Net), a deep learning-based framework that integrates a Spatial Relationship Fusion (SRF) mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08488v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08488v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08488v1-abstract-full" style="display: none;"> Total hip arthroplasty (THA) relies on accurate landmark detection from radiographic images, but unstructured data caused by irregular patient postures or occluded anatomical markers pose significant challenges for existing methods. To address this, we propose UNSCT-HRNet (Unstructured CT - High-Resolution Net), a deep learning-based framework that integrates a Spatial Relationship Fusion (SRF) module and an Uncertainty Estimation (UE) module. The SRF module, utilizing coordinate convolution and polarized attention, enhances the model's ability to capture complex spatial relationships. Meanwhile, the UE module which based on entropy ensures predictions are anatomically relevant. For unstructured data, the proposed method can predict landmarks without relying on the fixed number of points, which shows higher accuracy and better robustness comparing with the existing methods. Our UNSCT-HRNet demonstrates over a 60% improvement across multiple metrics in unstructured data. The experimental results also reveal that our approach maintains good performance on the structured dataset. Overall, the proposed UNSCT-HRNet has the potential to be used as a new reliable, automated solution for THA surgical planning and postoperative monitoring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08488v1-abstract-full').style.display = 'none'; document.getElementById('2411.08488v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06135">arXiv:2411.06135</a> <span> [<a href="https://arxiv.org/pdf/2411.06135">pdf</a>, <a href="https://arxiv.org/format/2411.06135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Online Parallel Multi-Task Relationship Learning via Alternating Direction Method of Multipliers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peilin Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangxia Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuewei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06135v1-abstract-short" style="display: inline;"> Online multi-task learning (OMTL) enhances streaming data processing by leveraging the inherent relations among multiple tasks. It can be described as an optimization problem in which a single loss function is defined for multiple tasks. Existing gradient-descent-based methods for this problem might suffer from gradient vanishing and poor conditioning issues. Furthermore, the centralized setting h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06135v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06135v1-abstract-full" style="display: none;"> Online multi-task learning (OMTL) enhances streaming data processing by leveraging the inherent relations among multiple tasks. It can be described as an optimization problem in which a single loss function is defined for multiple tasks. Existing gradient-descent-based methods for this problem might suffer from gradient vanishing and poor conditioning issues. Furthermore, the centralized setting hinders their application to online parallel optimization, which is vital to big data analytics. Therefore, this study proposes a novel OMTL framework based on the alternating direction multiplier method (ADMM), a recent breakthrough in optimization suitable for the distributed computing environment because of its decomposable and easy-to-implement nature. The relations among multiple tasks are modeled dynamically to fit the constant changes in an online scenario. In a classical distributed computing architecture with a central server, the proposed OMTL algorithm with the ADMM optimizer outperforms SGD-based approaches in terms of accuracy and efficiency. Because the central server might become a bottleneck when the data scale grows, we further tailor the algorithm to a decentralized setting, so that each node can work by only exchanging information with local neighbors. Experimental results on a synthetic and several real-world datasets demonstrate the efficiency of our methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06135v1-abstract-full').style.display = 'none'; document.getElementById('2411.06135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accpeted by Neurocomputing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04680">arXiv:2411.04680</a> <span> [<a href="https://arxiv.org/pdf/2411.04680">pdf</a>, <a href="https://arxiv.org/format/2411.04680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Differentially Private Continual Learning using Pre-Trained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tobaben%2C+M">Marlon Tobaben</a>, <a href="/search/cs?searchtype=author&query=Klasson%2C+M">Marcus Klasson</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Solin%2C+A">Arno Solin</a>, <a href="/search/cs?searchtype=author&query=Honkela%2C+A">Antti Honkela</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04680v2-abstract-short" style="display: inline;"> This work explores the intersection of continual learning (CL) and differential privacy (DP). Crucially, continual learning models must retain knowledge across tasks, but this conflicts with the differential privacy requirement of restricting individual samples to be memorised in the model. We propose using pre-trained models to address the trade-offs between privacy and performance in a continual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04680v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04680v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04680v2-abstract-full" style="display: none;"> This work explores the intersection of continual learning (CL) and differential privacy (DP). Crucially, continual learning models must retain knowledge across tasks, but this conflicts with the differential privacy requirement of restricting individual samples to be memorised in the model. We propose using pre-trained models to address the trade-offs between privacy and performance in a continual learning setting. More specifically, we present necessary assumptions to enable privacy-preservation and propose combining pre-trained models with parameter-free classifiers and parameter-efficient adapters that are learned under differential privacy. Our experiments demonstrate their effectiveness and provide insights into balancing the competing demands of continual learning and privacy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04680v2-abstract-full').style.display = 'none'; document.getElementById('2411.04680v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 3 figures, Accepted at Scalable Continual Learning for Lifelong Foundation Models Workshop at 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04106">arXiv:2411.04106</a> <span> [<a href="https://arxiv.org/pdf/2411.04106">pdf</a>, <a href="https://arxiv.org/format/2411.04106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comparative Study of Deep Reinforcement Learning for Crop Production Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Balderas%2C+J">Joseph Balderas</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yanbo Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Li Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ren-Cang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04106v1-abstract-short" style="display: inline;"> Crop production management is essential for optimizing yield and minimizing a field's environmental impact to crop fields, yet it remains challenging due to the complex and stochastic processes involved. Recently, researchers have turned to machine learning to address these complexities. Specifically, reinforcement learning (RL), a cutting-edge approach designed to learn optimal decision-making st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04106v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04106v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04106v1-abstract-full" style="display: none;"> Crop production management is essential for optimizing yield and minimizing a field's environmental impact to crop fields, yet it remains challenging due to the complex and stochastic processes involved. Recently, researchers have turned to machine learning to address these complexities. Specifically, reinforcement learning (RL), a cutting-edge approach designed to learn optimal decision-making strategies through trial and error in dynamic environments, has emerged as a promising tool for developing adaptive crop management policies. RL models aim to optimize long-term rewards by continuously interacting with the environment, making them well-suited for tackling the uncertainties and variability inherent in crop management. Studies have shown that RL can generate crop management policies that compete with, and even outperform, expert-designed policies within simulation-based crop models. In the gym-DSSAT crop model environment, one of the most widely used simulators for crop management, proximal policy optimization (PPO) and deep Q-networks (DQN) have shown promising results. However, these methods have not yet been systematically evaluated under identical conditions. In this study, we evaluated PPO and DQN against static baseline policies across three different RL tasks, fertilization, irrigation, and mixed management, provided by the gym-DSSAT environment. To ensure a fair comparison, we used consistent default parameters, identical reward functions, and the same environment settings. Our results indicate that PPO outperforms DQN in fertilization and irrigation tasks, while DQN excels in the mixed management task. This comparative analysis provides critical insights into the strengths and limitations of each approach, advancing the development of more effective RL-based crop management strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04106v1-abstract-full').style.display = 'none'; document.getElementById('2411.04106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03769">arXiv:2411.03769</a> <span> [<a href="https://arxiv.org/pdf/2411.03769">pdf</a>, <a href="https://arxiv.org/format/2411.03769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> No Culture Left Behind: ArtELingo-28, a Benchmark of WikiArt with Captions in 28 Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mohamed%2C+Y">Youssef Mohamed</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runjia Li</a>, <a href="/search/cs?searchtype=author&query=Ahmad%2C+I+S">Ibrahim Said Ahmad</a>, <a href="/search/cs?searchtype=author&query=Haydarov%2C+K">Kilichbek Haydarov</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Church%2C+K+W">Kenneth Ward Church</a>, <a href="/search/cs?searchtype=author&query=Elhoseiny%2C+M">Mohamed Elhoseiny</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03769v1-abstract-short" style="display: inline;"> Research in vision and language has made considerable progress thanks to benchmarks such as COCO. COCO captions focused on unambiguous facts in English; ArtEmis introduced subjective emotions and ArtELingo introduced some multilinguality (Chinese and Arabic). However we believe there should be more multilinguality. Hence, we present ArtELingo-28, a vision-language benchmark that spans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03769v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03769v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03769v1-abstract-full" style="display: none;"> Research in vision and language has made considerable progress thanks to benchmarks such as COCO. COCO captions focused on unambiguous facts in English; ArtEmis introduced subjective emotions and ArtELingo introduced some multilinguality (Chinese and Arabic). However we believe there should be more multilinguality. Hence, we present ArtELingo-28, a vision-language benchmark that spans $\textbf{28}$ languages and encompasses approximately $\textbf{200,000}$ annotations ($\textbf{140}$ annotations per image). Traditionally, vision research focused on unambiguous class labels, whereas ArtELingo-28 emphasizes diversity of opinions over languages and cultures. The challenge is to build machine learning systems that assign emotional captions to images. Baseline results will be presented for three novel conditions: Zero-Shot, Few-Shot and One-vs-All Zero-Shot. We find that cross-lingual transfer is more successful for culturally-related languages. Data and code are provided at www.artelingo.org. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03769v1-abstract-full').style.display = 'none'; document.getElementById('2411.03769v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, Accepted at EMNLP 24, for more details see www.artelingo.org</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03350">arXiv:2411.03350</a> <span> [<a href="https://arxiv.org/pdf/2411.03350">pdf</a>, <a href="https://arxiv.org/format/2411.03350">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and Trustworthiness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fali Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xianren Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zongyu Wu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tzuhao Mo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qiuhao Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wanjing Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Junjie Xu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qi He</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yao Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Ming Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Suhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03350v1-abstract-short" style="display: inline;"> Large language models (LLM) have demonstrated emergent abilities in text generation, question answering, and reasoning, facilitating various tasks and domains. Despite their proficiency in various tasks, LLMs like LaPM 540B and Llama-3.1 405B face limitations due to large parameter sizes and computational demands, often requiring cloud API use which raises privacy concerns, limits real-time applic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03350v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03350v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03350v1-abstract-full" style="display: none;"> Large language models (LLM) have demonstrated emergent abilities in text generation, question answering, and reasoning, facilitating various tasks and domains. Despite their proficiency in various tasks, LLMs like LaPM 540B and Llama-3.1 405B face limitations due to large parameter sizes and computational demands, often requiring cloud API use which raises privacy concerns, limits real-time applications on edge devices, and increases fine-tuning costs. Additionally, LLMs often underperform in specialized domains such as healthcare and law due to insufficient domain-specific knowledge, necessitating specialized models. Therefore, Small Language Models (SLMs) are increasingly favored for their low inference latency, cost-effectiveness, efficient development, and easy customization and adaptability. These models are particularly well-suited for resource-limited environments and domain knowledge acquisition, addressing LLMs' challenges and proving ideal for applications that require localized data handling for privacy, minimal inference latency for efficiency, and domain knowledge acquisition through lightweight fine-tuning. The rising demand for SLMs has spurred extensive research and development. However, a comprehensive survey investigating issues related to the definition, acquisition, application, enhancement, and reliability of SLM remains lacking, prompting us to conduct a detailed survey on these topics. The definition of SLMs varies widely, thus to standardize, we propose defining SLMs by their capability to perform specialized tasks and suitability for resource-constrained settings, setting boundaries based on the minimal size for emergent abilities and the maximum size sustainable under resource constraints. For other aspects, we provide a taxonomy of relevant models/methods and develop general frameworks for each category to enhance and utilize SLMs effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03350v1-abstract-full').style.display = 'none'; document.getElementById('2411.03350v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">76 pages, 26 figures, 14 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50 (Primary) 68T07 (Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01897">arXiv:2411.01897</a> <span> [<a href="https://arxiv.org/pdf/2411.01897">pdf</a>, <a href="https://arxiv.org/format/2411.01897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LE-PDE++: Mamba for accelerating PDEs Simulations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+A">Aoming Liang</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+Z">Zhaoyang Mu</a>, <a href="/search/cs?searchtype=author&query=liu%2C+Q">Qi liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruipeng Li</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+M">Mingming Ge</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+D">Dixia Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01897v2-abstract-short" style="display: inline;"> Partial Differential Equations are foundational in modeling science and natural systems such as fluid dynamics and weather forecasting. The Latent Evolution of PDEs method is designed to address the computational intensity of classical and deep learning-based PDE solvers by proposing a scalable and efficient alternative. To enhance the efficiency and accuracy of LE-PDE, we incorporate the Mamba mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01897v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01897v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01897v2-abstract-full" style="display: none;"> Partial Differential Equations are foundational in modeling science and natural systems such as fluid dynamics and weather forecasting. The Latent Evolution of PDEs method is designed to address the computational intensity of classical and deep learning-based PDE solvers by proposing a scalable and efficient alternative. To enhance the efficiency and accuracy of LE-PDE, we incorporate the Mamba model, an advanced machine learning model known for its predictive efficiency and robustness in handling complex dynamic systems with a progressive learning strategy. The LE-PDE was tested on several benchmark problems. The method demonstrated a marked reduction in computational time compared to traditional solvers and standalone deep learning models while maintaining high accuracy in predicting system behavior over time. Our method doubles the inference speed compared to the LE-PDE while retaining the same level of parameter efficiency, making it well-suited for scenarios requiring long-term predictions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01897v2-abstract-full').style.display = 'none'; document.getElementById('2411.01897v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00761">arXiv:2411.00761</a> <span> [<a href="https://arxiv.org/pdf/2411.00761">pdf</a>, <a href="https://arxiv.org/format/2411.00761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> LCP: Enhancing Scientific Data Management with Lossy Compression for Particles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Longtao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Congrong Ren</a>, <a href="/search/cs?searchtype=author&query=Di%2C+S">Sheng Di</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinyang Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiajun Huang</a>, <a href="/search/cs?searchtype=author&query=Underwood%2C+R">Robert Underwood</a>, <a href="/search/cs?searchtype=author&query=Grosset%2C+P">Pascal Grosset</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dingwen Tao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xin Liang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanqi Guo</a>, <a href="/search/cs?searchtype=author&query=Capello%2C+F">Franck Capello</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00761v1-abstract-short" style="display: inline;"> Many scientific applications opt for particles instead of meshes as their basic primitives to model complex systems composed of billions of discrete entities. Such applications span a diverse array of scientific domains, including molecular dynamics, cosmology, computational fluid dynamics, and geology. The scale of the particles in those scientific applications increases substantially thanks to t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00761v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00761v1-abstract-full" style="display: none;"> Many scientific applications opt for particles instead of meshes as their basic primitives to model complex systems composed of billions of discrete entities. Such applications span a diverse array of scientific domains, including molecular dynamics, cosmology, computational fluid dynamics, and geology. The scale of the particles in those scientific applications increases substantially thanks to the ever-increasing computational power in high-performance computing (HPC) platforms. However, the actual gains from such increases are often undercut by obstacles in data management systems related to data storage, transfer, and processing. Lossy compression has been widely recognized as a promising solution to enhance scientific data management systems regarding such challenges, although most existing compression solutions are tailored for Cartesian grids and thus have sub-optimal results on discrete particle data. In this paper, we introduce LCP, an innovative lossy compressor designed for particle datasets, offering superior compression quality and higher speed than existing compression solutions. Specifically, our contribution is threefold. (1) We propose LCP-S, an error-bound aware block-wise spatial compressor to efficiently reduce particle data size. This approach is universally applicable to particle data across various domains. (2) We develop LCP, a hybrid compression solution for multi-frame particle data, featuring dynamic method selection and parameter optimization. (3) We evaluate our solution alongside eight state-of-the-art alternatives on eight real-world particle datasets from seven distinct domains. The results demonstrate that our solution achieves up to 104% improvement in compression ratios and up to 593% increase in speed compared to the second-best option, under the same error criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00761v1-abstract-full').style.display = 'none'; document.getElementById('2411.00761v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SIGMOD'25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00422">arXiv:2411.00422</a> <span> [<a href="https://arxiv.org/pdf/2411.00422">pdf</a>, <a href="https://arxiv.org/format/2411.00422">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> MAP the Blockchain World: A Trustless and Scalable Blockchain Interoperability Protocol for Cross-chain Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yinfeng Cao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiannong Cao</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+D">Dongbin Bai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+L">Long Wen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruidong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00422v1-abstract-short" style="display: inline;"> Blockchain interoperability protocols enable cross-chain asset transfers or data retrievals between isolated chains, which are considered as the core infrastructure for Web 3.0 applications such as decentralized finance protocols. However, existing protocols either face severe scalability issues due to high on-chain and off-chain costs, or suffer from trust concerns because of centralized designs.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00422v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00422v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00422v1-abstract-full" style="display: none;"> Blockchain interoperability protocols enable cross-chain asset transfers or data retrievals between isolated chains, which are considered as the core infrastructure for Web 3.0 applications such as decentralized finance protocols. However, existing protocols either face severe scalability issues due to high on-chain and off-chain costs, or suffer from trust concerns because of centralized designs. In this paper, we propose \texttt{MAP}, a trustless blockchain interoperability protocol that relays cross-chain transactions across heterogeneous chains with high scalability. First, within \texttt{MAP}, we develop a novel \textit{cross-chain relay} technique, which integrates a unified relay chain architecture and on-chain light clients of different source chains, allowing the retrieval and verification of diverse cross-chain transactions. Furthermore, we reduce cross-chain verification costs by incorporating an optimized zk-based light client scheme that adaptively decouples signature verification overheads from inefficient smart contract execution and offloads them to off-chain provers. For experiments, we conducted the first large-scale evaluation on existing interoperability protocols. With \texttt{MAP}, the required number of on-chain light clients is reduced from $O(N^2)$ to $O(N)$, with around 35\% reduction in on-chain costs and 25\% reduction for off-chain costs when verifying cross-chain transactions. To demonstrate the effectiveness, we deployed \texttt{MAP} in the real world. By 2024, we have supported over six popular public chains, 50 cross-chain applications and relayed over 200K cross-chain transactions worth over 640 million USD. Based on rich practical experiences, we constructed the first real-world cross-chain dataset to further advance blockchain interoperability research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00422v1-abstract-full').style.display = 'none'; document.getElementById('2411.00422v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23718">arXiv:2410.23718</a> <span> [<a href="https://arxiv.org/pdf/2410.23718">pdf</a>, <a href="https://arxiv.org/format/2410.23718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GaussianMarker: Uncertainty-Aware Copyright Protection of 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiufeng Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+Y">Yiu-ming Cheung</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+K+C">Ka Chun Cheung</a>, <a href="/search/cs?searchtype=author&query=See%2C+S">Simon See</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23718v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has become a crucial method for acquiring 3D assets. To protect the copyright of these assets, digital watermarking techniques can be applied to embed ownership information discreetly within 3DGS models. However, existing watermarking methods for meshes, point clouds, and implicit radiance fields cannot be directly applied to 3DGS models, as 3DGS models use explicit 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23718v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23718v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has become a crucial method for acquiring 3D assets. To protect the copyright of these assets, digital watermarking techniques can be applied to embed ownership information discreetly within 3DGS models. However, existing watermarking methods for meshes, point clouds, and implicit radiance fields cannot be directly applied to 3DGS models, as 3DGS models use explicit 3D Gaussians with distinct structures and do not rely on neural networks. Naively embedding the watermark on a pre-trained 3DGS can cause obvious distortion in rendered images. In our work, we propose an uncertainty-based method that constrains the perturbation of model parameters to achieve invisible watermarking for 3DGS. At the message decoding stage, the copyright messages can be reliably extracted from both 3D Gaussians and 2D rendered images even under various forms of 3D and 2D distortions. We conduct extensive experiments on the Blender, LLFF and MipNeRF-360 datasets to validate the effectiveness of our proposed method, demonstrating state-of-the-art performance on both message decoding accuracy and view synthesis quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23718v1-abstract-full').style.display = 'none'; document.getElementById('2410.23718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23092">arXiv:2410.23092</a> <span> [<a href="https://arxiv.org/pdf/2410.23092">pdf</a>, <a href="https://arxiv.org/format/2410.23092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ Atomic Activity Recognition 2024 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tengfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tiejun Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanwei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuelei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23092v1-abstract-short" style="display: inline;"> This report presents our team's technical solution for participating in Track 3 of the 2024 ECCV ROAD++ Challenge. The task of Track 3 is atomic activity recognition, which aims to identify 64 types of atomic activities in road scenes based on video content. Our approach primarily addresses the challenges of small objects, discriminating between single object and a group of objects, as well as mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23092v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23092v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23092v1-abstract-full" style="display: none;"> This report presents our team's technical solution for participating in Track 3 of the 2024 ECCV ROAD++ Challenge. The task of Track 3 is atomic activity recognition, which aims to identify 64 types of atomic activities in road scenes based on video content. Our approach primarily addresses the challenges of small objects, discriminating between single object and a group of objects, as well as model overfitting in this task. Firstly, we construct a multi-branch activity recognition framework that not only separates different object categories but also the tasks of single object and object group recognition, thereby enhancing recognition accuracy. Subsequently, we develop various model ensembling strategies, including integrations of multiple frame sampling sequences, different frame sampling sequence lengths, multiple training epochs, and different backbone networks. Furthermore, we propose an atomic activity recognition data augmentation method, which greatly expands the sample space by flipping video frames and road topology, effectively mitigating model overfitting. Our methods rank first in the test set of Track 3 for the ROAD++ Challenge 2024, and achieve 69% mAP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23092v1-abstract-full').style.display = 'none'; document.getElementById('2410.23092v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23077">arXiv:2410.23077</a> <span> [<a href="https://arxiv.org/pdf/2410.23077">pdf</a>, <a href="https://arxiv.org/format/2410.23077">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> First Place Solution to the ECCV 2024 ROAD++ Challenge @ ROAD++ Spatiotemporal Agent Detection 2024 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tengfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Heng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruyang Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Q">Qi Deng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yaqian Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rengang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23077v1-abstract-short" style="display: inline;"> This report presents our team's solutions for the Track 1 of the 2024 ECCV ROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which aims to construct an "agent tube" for road agents in consecutive video frames. Our solutions focus on the challenges in this task, including extreme-size objects, low-light scenarios, class imbalance, and fine-grained classification. Firstly, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23077v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23077v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23077v1-abstract-full" style="display: none;"> This report presents our team's solutions for the Track 1 of the 2024 ECCV ROAD++ Challenge. The task of Track 1 is spatiotemporal agent detection, which aims to construct an "agent tube" for road agents in consecutive video frames. Our solutions focus on the challenges in this task, including extreme-size objects, low-light scenarios, class imbalance, and fine-grained classification. Firstly, the extreme-size object detection heads are introduced to improve the detection performance of large and small objects. Secondly, we design a dual-stream detection model with a low-light enhancement stream to improve the performance of spatiotemporal agent detection in low-light scenes, and the feature fusion module to integrate features from different branches. Subsequently, we develop a multi-branch detection framework to mitigate the issues of class imbalance and fine-grained classification, and we design a pre-training and fine-tuning approach to optimize the above multi-branch framework. Besides, we employ some common data augmentation techniques, and improve the loss function and upsampling operation. We rank first in the test set of Track 1 for the ROAD++ Challenge 2024, and achieve 30.82% average video-mAP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23077v1-abstract-full').style.display = 'none'; document.getElementById('2410.23077v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22489">arXiv:2410.22489</a> <span> [<a href="https://arxiv.org/pdf/2410.22489">pdf</a>, <a href="https://arxiv.org/format/2410.22489">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multimodality Helps Few-Shot 3D Point Cloud Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+Z">Zhaochong An</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guolei Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yun Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Runjia Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Min Wu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Ming-Ming Cheng</a>, <a href="/search/cs?searchtype=author&query=Konukoglu%2C+E">Ender Konukoglu</a>, <a href="/search/cs?searchtype=author&query=Belongie%2C+S">Serge Belongie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22489v2-abstract-short" style="display: inline;"> Few-shot 3D point cloud segmentation (FS-PCS) aims at generalizing models to segment novel categories with minimal annotated support samples. While existing FS-PCS methods have shown promise, they primarily focus on unimodal point cloud inputs, overlooking the potential benefits of leveraging multimodal information. In this paper, we address this gap by introducing a cost-free multimodal FS-PCS se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22489v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22489v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22489v2-abstract-full" style="display: none;"> Few-shot 3D point cloud segmentation (FS-PCS) aims at generalizing models to segment novel categories with minimal annotated support samples. While existing FS-PCS methods have shown promise, they primarily focus on unimodal point cloud inputs, overlooking the potential benefits of leveraging multimodal information. In this paper, we address this gap by introducing a cost-free multimodal FS-PCS setup, utilizing textual labels and the potentially available 2D image modality. Under this easy-to-achieve setup, we present the MultiModal Few-Shot SegNet (MM-FSS), a model effectively harnessing complementary information from multiple modalities. MM-FSS employs a shared backbone with two heads to extract intermodal and unimodal visual features, and a pretrained text encoder to generate text embeddings. To fully exploit the multimodal information, we propose a Multimodal Correlation Fusion (MCF) module to generate multimodal correlations, and a Multimodal Semantic Fusion (MSF) module to refine the correlations using text-aware semantic guidance. Additionally, we propose a simple yet effective Test-time Adaptive Cross-modal Calibration (TACC) technique to mitigate training bias, further improving generalization. Experimental results on S3DIS and ScanNet datasets demonstrate significant performance improvements achieved by our method. The efficacy of our approach indicates the benefits of leveraging commonly-ignored free modalities for FS-PCS, providing valuable insights for future research. The code is available at https://github.com/ZhaochongAn/Multimodality-3D-Few-Shot <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22489v2-abstract-full').style.display = 'none'; document.getElementById('2410.22489v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22135">arXiv:2410.22135</a> <span> [<a href="https://arxiv.org/pdf/2410.22135">pdf</a>, <a href="https://arxiv.org/format/2410.22135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Lightweight Frequency Masker for Cross-Domain Few-Shot Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tong%2C+J">Jintao Tong</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yixiong Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuhua Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruixuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22135v2-abstract-short" style="display: inline;"> Cross-domain few-shot segmentation (CD-FSS) is proposed to first pre-train the model on a large-scale source-domain dataset, and then transfer the model to data-scarce target-domain datasets for pixel-level segmentation. The significant domain gap between the source and target datasets leads to a sharp decline in the performance of existing few-shot segmentation (FSS) methods in cross-domain scena… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22135v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22135v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22135v2-abstract-full" style="display: none;"> Cross-domain few-shot segmentation (CD-FSS) is proposed to first pre-train the model on a large-scale source-domain dataset, and then transfer the model to data-scarce target-domain datasets for pixel-level segmentation. The significant domain gap between the source and target datasets leads to a sharp decline in the performance of existing few-shot segmentation (FSS) methods in cross-domain scenarios. In this work, we discover an intriguing phenomenon: simply filtering different frequency components for target domains can lead to a significant performance improvement, sometimes even as high as 14% mIoU. Then, we delve into this phenomenon for an interpretation, and find such improvements stem from the reduced inter-channel correlation in feature maps, which benefits CD-FSS with enhanced robustness against domain gaps and larger activated regions for segmentation. Based on this, we propose a lightweight frequency masker, which further reduces channel correlations by an Amplitude-Phase Masker (APM) module and an Adaptive Channel Phase Attention (ACPA) module. Notably, APM introduces only 0.01% additional parameters but improves the average performance by over 10%, and ACPA imports only 2.5% parameters but further improves the performance by over 1.5%, which significantly surpasses the state-of-the-art CD-FSS methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22135v2-abstract-full').style.display = 'none'; document.getElementById('2410.22135v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21794">arXiv:2410.21794</a> <span> [<a href="https://arxiv.org/pdf/2410.21794">pdf</a>, <a href="https://arxiv.org/format/2410.21794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Inverse Attention Agent for Multi-Agent System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+Q">Qian Long</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyan Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Minglu Zhao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tao Gao</a>, <a href="/search/cs?searchtype=author&query=Terzopoulos%2C+D">Demetri Terzopoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21794v1-abstract-short" style="display: inline;"> A major challenge for Multi-Agent Systems is enabling agents to adapt dynamically to diverse environments in which opponents and teammates may continually change. Agents trained using conventional methods tend to excel only within the confines of their training cohorts; their performance drops significantly when confronting unfamiliar agents. To address this shortcoming, we introduce Inverse Atten… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21794v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21794v1-abstract-full" style="display: none;"> A major challenge for Multi-Agent Systems is enabling agents to adapt dynamically to diverse environments in which opponents and teammates may continually change. Agents trained using conventional methods tend to excel only within the confines of their training cohorts; their performance drops significantly when confronting unfamiliar agents. To address this shortcoming, we introduce Inverse Attention Agents that adopt concepts from the Theory of Mind, implemented algorithmically using an attention mechanism and trained in an end-to-end manner. Crucial to determining the final actions of these agents, the weights in their attention model explicitly represent attention to different goals. We furthermore propose an inverse attention network that deduces the ToM of agents based on observations and prior actions. The network infers the attentional states of other agents, thereby refining the attention weights to adjust the agent's final action. We conduct experiments in a continuous environment, tackling demanding tasks encompassing cooperation, competition, and a blend of both. They demonstrate that the inverse attention network successfully infers the attention of other agents, and that this information improves agent performance. Additional human experiments show that, compared to baseline agent models, our inverse attention agents exhibit superior cooperation with humans and better emulate human behaviors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21794v1-abstract-full').style.display = 'none'; document.getElementById('2410.21794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20824">arXiv:2410.20824</a> <span> [<a href="https://arxiv.org/pdf/2410.20824">pdf</a>, <a href="https://arxiv.org/format/2410.20824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FreqMark: Invisible Image Watermarking via Frequency Based Optimization in Latent Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yiyang Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruizhe Li</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+M">Mude Hui</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chuangjian Cai</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+L">Le Wan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangfei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20824v1-abstract-short" style="display: inline;"> Invisible watermarking is essential for safeguarding digital content, enabling copyright protection and content authentication. However, existing watermarking methods fall short in robustness against regeneration attacks. In this paper, we propose a novel method called FreqMark that involves unconstrained optimization of the image latent frequency space obtained after VAE encoding. Specifically, F… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20824v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20824v1-abstract-full" style="display: none;"> Invisible watermarking is essential for safeguarding digital content, enabling copyright protection and content authentication. However, existing watermarking methods fall short in robustness against regeneration attacks. In this paper, we propose a novel method called FreqMark that involves unconstrained optimization of the image latent frequency space obtained after VAE encoding. Specifically, FreqMark embeds the watermark by optimizing the latent frequency space of the images and then extracts the watermark through a pre-trained image encoder. This optimization allows a flexible trade-off between image quality with watermark robustness and effectively resists regeneration attacks. Experimental results demonstrate that FreqMark offers significant advantages in image quality and robustness, permits flexible selection of the encoding bit number, and achieves a bit accuracy exceeding 90% when encoding a 48-bit hidden message under various attack scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20824v1-abstract-full').style.display = 'none'; document.getElementById('2410.20824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20711">arXiv:2410.20711</a> <span> [<a href="https://arxiv.org/pdf/2410.20711">pdf</a>, <a href="https://arxiv.org/format/2410.20711">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Contextual Representation Anchor Network to Alleviate Selection Bias in Few-Shot Drug Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruifeng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiangxin Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingqian Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyang Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xuemin Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20711v2-abstract-short" style="display: inline;"> In the drug discovery process, the low success rate of drug candidate screening often leads to insufficient labeled data, causing the few-shot learning problem in molecular property prediction. Existing methods for few-shot molecular property prediction overlook the sample selection bias, which arises from non-random sample selection in chemical experiments. This bias in data representativeness le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20711v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20711v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20711v2-abstract-full" style="display: none;"> In the drug discovery process, the low success rate of drug candidate screening often leads to insufficient labeled data, causing the few-shot learning problem in molecular property prediction. Existing methods for few-shot molecular property prediction overlook the sample selection bias, which arises from non-random sample selection in chemical experiments. This bias in data representativeness leads to suboptimal performance. To overcome this challenge, we present a novel method named contextual representation anchor Network (CRA), where an anchor refers to a cluster center of the representations of molecules and serves as a bridge to transfer enriched contextual knowledge into molecular representations and enhance their expressiveness. CRA introduces a dual-augmentation mechanism that includes context augmentation, which dynamically retrieves analogous unlabeled molecules and captures their task-specific contextual knowledge to enhance the anchors, and anchor augmentation, which leverages the anchors to augment the molecular representations. We evaluate our approach on the MoleculeNet and FS-Mol benchmarks, as well as in domain transfer experiments. The results demonstrate that CRA outperforms the state-of-the-art by 2.60% and 3.28% in AUC and $螖$AUC-PR metrics, respectively, and exhibits superior generalization capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20711v2-abstract-full').style.display = 'none'; document.getElementById('2410.20711v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68U07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20389">arXiv:2410.20389</a> <span> [<a href="https://arxiv.org/pdf/2410.20389">pdf</a>, <a href="https://arxiv.org/format/2410.20389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Lodge++: High-quality and Long Dance Generation with Vivid Choreography Patterns </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ronghui Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yachao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jie Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yebin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20389v1-abstract-short" style="display: inline;"> We propose Lodge++, a choreography framework to generate high-quality, ultra-long, and vivid dances given the music and desired genre. To handle the challenges in computational efficiency, the learning of complex and vivid global choreography patterns, and the physical quality of local dance movements, Lodge++ adopts a two-stage strategy to produce dances from coarse to fine. In the first stage, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20389v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20389v1-abstract-full" style="display: none;"> We propose Lodge++, a choreography framework to generate high-quality, ultra-long, and vivid dances given the music and desired genre. To handle the challenges in computational efficiency, the learning of complex and vivid global choreography patterns, and the physical quality of local dance movements, Lodge++ adopts a two-stage strategy to produce dances from coarse to fine. In the first stage, a global choreography network is designed to generate coarse-grained dance primitives that capture complex global choreography patterns. In the second stage, guided by these dance primitives, a primitive-based dance diffusion model is proposed to further generate high-quality, long-sequence dances in parallel, faithfully adhering to the complex choreography patterns. Additionally, to improve the physical plausibility, Lodge++ employs a penetration guidance module to resolve character self-penetration, a foot refinement module to optimize foot-ground contact, and a multi-genre discriminator to maintain genre consistency throughout the dance. Lodge++ is validated by extensive experiments, which show that our method can rapidly generate ultra-long dances suitable for various dance genres, ensuring well-organized global choreography patterns and high-quality local motion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20389v1-abstract-full').style.display = 'none'; document.getElementById('2410.20389v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://li-ronghui.github.io/lodgepp</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19744">arXiv:2410.19744</a> <span> [<a href="https://arxiv.org/pdf/2410.19744">pdf</a>, <a href="https://arxiv.org/format/2410.19744">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Next-Generation LLM-based Recommender Systems: A Survey and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jindong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Q">Qianli Xing</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+R">Runliang Niu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+H">He Kong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Long%2C+G">Guodong Long</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengqi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19744v1-abstract-short" style="display: inline;"> Large language models (LLMs) have not only revolutionized the field of natural language processing (NLP) but also have the potential to bring a paradigm shift in many other fields due to their remarkable abilities of language understanding, as well as impressive generalization capabilities and reasoning skills. As a result, recent studies have actively attempted to harness the power of LLMs to imp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19744v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19744v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19744v1-abstract-full" style="display: none;"> Large language models (LLMs) have not only revolutionized the field of natural language processing (NLP) but also have the potential to bring a paradigm shift in many other fields due to their remarkable abilities of language understanding, as well as impressive generalization capabilities and reasoning skills. As a result, recent studies have actively attempted to harness the power of LLMs to improve recommender systems, and it is imperative to thoroughly review the recent advances and challenges of LLM-based recommender systems. Unlike existing work, this survey does not merely analyze the classifications of LLM-based recommendation systems according to the technical framework of LLMs. Instead, it investigates how LLMs can better serve recommendation tasks from the perspective of the recommender system community, thus enhancing the integration of large language models into the research of recommender system and its practical application. In addition, the long-standing gap between academic research and industrial applications related to recommender systems has not been well discussed, especially in the era of large language models. In this review, we introduce a novel taxonomy that originates from the intrinsic essence of recommendation, delving into the application of large language model-based recommendation systems and their industrial implementation. Specifically, we propose a three-tier structure that more accurately reflects the developmental progression of recommendation systems from research to practical implementation, including representing and understanding, scheming and utilizing, and industrial deployment. Furthermore, we discuss critical challenges and opportunities in this emerging field. A more up-to-date version of the papers is maintained at: https://github.com/jindongli-Ai/Next-Generation-LLM-based-Recommender-Systems-Survey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19744v1-abstract-full').style.display = 'none'; document.getElementById('2410.19744v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19485">arXiv:2410.19485</a> <span> [<a href="https://arxiv.org/pdf/2410.19485">pdf</a>, <a href="https://arxiv.org/format/2410.19485">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Debate-Driven Experiment on LLM Hallucinations and Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ray Li</a>, <a href="/search/cs?searchtype=author&query=Bagade%2C+T">Tanishka Bagade</a>, <a href="/search/cs?searchtype=author&query=Martinez%2C+K">Kevin Martinez</a>, <a href="/search/cs?searchtype=author&query=Yasmin%2C+F">Flora Yasmin</a>, <a href="/search/cs?searchtype=author&query=Ayala%2C+G">Grant Ayala</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+M">Michael Lam</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kevin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19485v1-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved a degree of success in generating coherent and contextually relevant text, yet they remain prone to a significant challenge known as hallucination: producing information that is not substantiated by the input or external knowledge. Previous efforts to mitigate hallucinations have focused on techniques such as fine-tuning models on high-quality datasets, i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19485v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19485v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19485v1-abstract-full" style="display: none;"> Large language models (LLMs) have achieved a degree of success in generating coherent and contextually relevant text, yet they remain prone to a significant challenge known as hallucination: producing information that is not substantiated by the input or external knowledge. Previous efforts to mitigate hallucinations have focused on techniques such as fine-tuning models on high-quality datasets, incorporating fact-checking mechanisms, and developing adversarial training methods. While these approaches have shown some promise, they often address the issue at the level of individual model outputs, leaving unexplored the effects of inter-model interactions on hallucination. This study investigates the phenomenon of hallucination in LLMs through a novel experimental framework where multiple instances of GPT-4o-Mini models engage in a debate-like interaction prompted with questions from the TruthfulQA dataset. One model is deliberately instructed to generate plausible but false answers while the other models are asked to respond truthfully. The experiment is designed to assess whether the introduction of misinformation by one model can challenge the truthful majority to better justify their reasoning, improving performance on the TruthfulQA benchmark. The findings suggest that inter-model interactions can offer valuable insights into improving the accuracy and robustness of LLM outputs, complementing existing mitigation strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19485v1-abstract-full').style.display = 'none'; document.getElementById('2410.19485v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19390">arXiv:2410.19390</a> <span> [<a href="https://arxiv.org/pdf/2410.19390">pdf</a>, <a href="https://arxiv.org/format/2410.19390">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cosmology and Nongalactic Astrophysics">astro-ph.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1051/0004-6361/202349113">10.1051/0004-6361/202349113 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CLAP. I. Resolving miscalibration for deep learning-based galaxy photometric redshift estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qiufan Lin</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+H">Hengxin Ruan</a>, <a href="/search/cs?searchtype=author&query=Fouchez%2C+D">Dominique Fouchez</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shupei Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Montero-Camacho%2C+P">Paulo Montero-Camacho</a>, <a href="/search/cs?searchtype=author&query=Napolitano%2C+N+R">Nicola R. Napolitano</a>, <a href="/search/cs?searchtype=author&query=Ting%2C+Y">Yuan-Sen Ting</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19390v1-abstract-short" style="display: inline;"> Obtaining well-calibrated photometric redshift probability densities for galaxies without a spectroscopic measurement remains a challenge. Deep learning discriminative models, typically fed with multi-band galaxy images, can produce outputs that mimic probability densities and achieve state-of-the-art accuracy. However, such models may be affected by miscalibration that would result in discrepanci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19390v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19390v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19390v1-abstract-full" style="display: none;"> Obtaining well-calibrated photometric redshift probability densities for galaxies without a spectroscopic measurement remains a challenge. Deep learning discriminative models, typically fed with multi-band galaxy images, can produce outputs that mimic probability densities and achieve state-of-the-art accuracy. However, such models may be affected by miscalibration that would result in discrepancies between the model outputs and the actual distributions of true redshifts. Our work develops a novel method called the Contrastive Learning and Adaptive KNN for Photometric Redshift (CLAP) that resolves this issue. It leverages supervised contrastive learning (SCL) and k-nearest neighbours (KNN) to construct and calibrate raw probability density estimates, and implements a refitting procedure to resume end-to-end discriminative models ready to produce final estimates for large-scale imaging data. The harmonic mean is adopted to combine an ensemble of estimates from multiple realisations for improving accuracy. Our experiments demonstrate that CLAP takes advantage of both deep learning and KNN, outperforming benchmark methods on the calibration of probability density estimates and retaining high accuracy and computational efficiency. With reference to CLAP, we point out that miscalibration is particularly sensitive to the method-induced excessive correlations among data instances in addition to the unaccounted-for epistemic uncertainties. Reducing the uncertainties may not guarantee the removal of miscalibration due to the presence of such excessive correlations, yet this is a problem for conventional deep learning methods rather than CLAP. These discussions underscore the robustness of CLAP for obtaining photometric redshift probability densities required by astrophysical and cosmological applications. This is the first paper in our series on CLAP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19390v1-abstract-full').style.display = 'none'; document.getElementById('2410.19390v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 + 6 pages, 9 + 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> A&A 691, A331 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18956">arXiv:2410.18956</a> <span> [<a href="https://arxiv.org/pdf/2410.18956">pdf</a>, <a href="https://arxiv.org/format/2410.18956">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Large Spatial Model: End-to-end Unposed Images to Semantic 3D </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhiwen Fan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+W">Wenyan Cong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peihao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Renjie Li</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+K">Kairun Wen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shijie Zhou</a>, <a href="/search/cs?searchtype=author&query=Kadambi%2C+A">Achuta Kadambi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Danfei Xu</a>, <a href="/search/cs?searchtype=author&query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18956v2-abstract-short" style="display: inline;"> Reconstructing and understanding 3D structures from a limited number of images is a well-established problem in computer vision. Traditional methods usually break this task into multiple subtasks, each requiring complex transformations between different data representations. For instance, dense reconstruction through Structure-from-Motion (SfM) involves converting images into key points, optimizin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18956v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18956v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18956v2-abstract-full" style="display: none;"> Reconstructing and understanding 3D structures from a limited number of images is a well-established problem in computer vision. Traditional methods usually break this task into multiple subtasks, each requiring complex transformations between different data representations. For instance, dense reconstruction through Structure-from-Motion (SfM) involves converting images into key points, optimizing camera parameters, and estimating structures. Afterward, accurate sparse reconstructions are required for further dense modeling, which is subsequently fed into task-specific neural networks. This multi-step process results in considerable processing time and increased engineering complexity. In this work, we present the Large Spatial Model (LSM), which processes unposed RGB images directly into semantic radiance fields. LSM simultaneously estimates geometry, appearance, and semantics in a single feed-forward operation, and it can generate versatile label maps by interacting with language at novel viewpoints. Leveraging a Transformer-based architecture, LSM integrates global geometry through pixel-aligned point maps. To enhance spatial attribute regression, we incorporate local context aggregation with multi-scale fusion, improving the accuracy of fine local details. To tackle the scarcity of labeled 3D semantic data and enable natural language-driven scene manipulation, we incorporate a pre-trained 2D language-based segmentation model into a 3D-consistent semantic feature field. An efficient decoder then parameterizes a set of semantic anisotropic Gaussians, facilitating supervised end-to-end learning. Extensive experiments across various tasks show that LSM unifies multiple 3D vision tasks directly from unposed images, achieving real-time semantic 3D reconstruction for the first time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18956v2-abstract-full').style.display = 'none'; document.getElementById('2410.18956v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://largespatialmodel.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18410">arXiv:2410.18410</a> <span> [<a href="https://arxiv.org/pdf/2410.18410">pdf</a>, <a href="https://arxiv.org/format/2410.18410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FreCaS: Efficient Higher-Resolution Image Generation via Frequency-aware Cascaded Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengqiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruihuang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18410v1-abstract-short" style="display: inline;"> While image generation with diffusion models has achieved a great success, generating images of higher resolution than the training size remains a challenging task due to the high computational cost. Current methods typically perform the entire sampling process at full resolution and process all frequency components simultaneously, contradicting with the inherent coarse-to-fine nature of latent di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18410v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18410v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18410v1-abstract-full" style="display: none;"> While image generation with diffusion models has achieved a great success, generating images of higher resolution than the training size remains a challenging task due to the high computational cost. Current methods typically perform the entire sampling process at full resolution and process all frequency components simultaneously, contradicting with the inherent coarse-to-fine nature of latent diffusion models and wasting computations on processing premature high-frequency details at early diffusion stages. To address this issue, we introduce an efficient $\textbf{Fre}$quency-aware $\textbf{Ca}$scaded $\textbf{S}$ampling framework, $\textbf{FreCaS}$ in short, for higher-resolution image generation. FreCaS decomposes the sampling process into cascaded stages with gradually increased resolutions, progressively expanding frequency bands and refining the corresponding details. We propose an innovative frequency-aware classifier-free guidance (FA-CFG) strategy to assign different guidance strengths for different frequency components, directing the diffusion model to add new details in the expanded frequency domain of each stage. Additionally, we fuse the cross-attention maps of previous and current stages to avoid synthesizing unfaithful layouts. Experiments demonstrate that FreCaS significantly outperforms state-of-the-art methods in image quality and generation speed. In particular, FreCaS is about 2.86$\times$ and 6.07$\times$ faster than ScaleCrafter and DemoFusion in generating a 2048$\times$2048 image using a pre-trained SDXL model and achieves an FID$_b$ improvement of 11.6 and 3.7, respectively. FreCaS can be easily extended to more complex models such as SD3. The source code of FreCaS can be found at $\href{\text{https://github.com/xtudbxk/FreCaS}}{https://github.com/xtudbxk/FreCaS}$. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18410v1-abstract-full').style.display = 'none'; document.getElementById('2410.18410v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17031">arXiv:2410.17031</a> <span> [<a href="https://arxiv.org/pdf/2410.17031">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GeoCode-GPT: A Large Language Model for Geospatial Code Generation Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hou%2C+S">Shuyang Hou</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhangxiao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+A">Anqi Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jianyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+Z">Zhipeng Gui</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xuefeng Guan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Huayi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17031v2-abstract-short" style="display: inline;"> The increasing demand for spatiotemporal data and modeling tasks in geosciences has made geospatial code generation technology a critical factor in enhancing productivity. Although large language models (LLMs) have demonstrated potential in code generation tasks, they often encounter issues such as refusal to code or hallucination in geospatial code generation due to a lack of domain-specific know… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17031v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17031v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17031v2-abstract-full" style="display: none;"> The increasing demand for spatiotemporal data and modeling tasks in geosciences has made geospatial code generation technology a critical factor in enhancing productivity. Although large language models (LLMs) have demonstrated potential in code generation tasks, they often encounter issues such as refusal to code or hallucination in geospatial code generation due to a lack of domain-specific knowledge and code corpora. To address these challenges, this paper presents and open-sources the GeoCode-PT and GeoCode-SFT corpora, along with the GeoCode-Eval evaluation dataset. Additionally, by leveraging QLoRA and LoRA for pretraining and fine-tuning, we introduce GeoCode-GPT-7B, the first LLM focused on geospatial code generation, fine-tuned from Code Llama-7B. Furthermore, we establish a comprehensive geospatial code evaluation framework, incorporating option matching, expert validation, and prompt engineering scoring for LLMs, and systematically evaluate GeoCode-GPT-7B using the GeoCode-Eval dataset. Experimental results show that GeoCode-GPT outperforms other models in multiple-choice accuracy by 9.1% to 32.1%, in code summarization ability by 1.7% to 25.4%, and in code generation capability by 1.2% to 25.1%. This paper provides a solution and empirical validation for enhancing LLMs' performance in geospatial code generation, extends the boundaries of domain-specific model applications, and offers valuable insights into unlocking their potential in geospatial code generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17031v2-abstract-full').style.display = 'none'; document.getElementById('2410.17031v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16708">arXiv:2410.16708</a> <span> [<a href="https://arxiv.org/pdf/2410.16708">pdf</a>, <a href="https://arxiv.org/format/2410.16708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Atomic Fact Decomposition Helps Attributed Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhichao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiapu Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaoyan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoli Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ru Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J+Z">Jeff Z. Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16708v1-abstract-short" style="display: inline;"> Attributed Question Answering (AQA) aims to provide both a trustworthy answer and a reliable attribution report for a given question. Retrieval is a widely adopted approach, including two general paradigms: Retrieval-Then-Read (RTR) and post-hoc retrieval. Recently, Large Language Models (LLMs) have shown remarkable proficiency, prompting growing interest in AQA among researchers. However, RTR-bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16708v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16708v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16708v1-abstract-full" style="display: none;"> Attributed Question Answering (AQA) aims to provide both a trustworthy answer and a reliable attribution report for a given question. Retrieval is a widely adopted approach, including two general paradigms: Retrieval-Then-Read (RTR) and post-hoc retrieval. Recently, Large Language Models (LLMs) have shown remarkable proficiency, prompting growing interest in AQA among researchers. However, RTR-based AQA often suffers from irrelevant knowledge and rapidly changing information, even when LLMs are adopted, while post-hoc retrieval-based AQA struggles with comprehending long-form answers with complex logic, and precisely identifying the content needing revision and preserving the original intent. To tackle these problems, this paper proposes an Atomic fact decomposition-based Retrieval and Editing (ARE) framework, which decomposes the generated long-form answers into molecular clauses and atomic facts by the instruction-tuned LLMs. Notably, the instruction-tuned LLMs are fine-tuned using a well-constructed dataset, generated from large scale Knowledge Graphs (KGs). This process involves extracting one-hop neighbors from a given set of entities and transforming the result into coherent long-form text. Subsequently, ARE leverages a search engine to retrieve evidences related to atomic facts, inputting these evidences into an LLM-based verifier to determine whether the facts require expansion for re-retrieval or editing. Furthermore, the edited facts are backtracked into the original answer, with evidence aggregated based on the relationship between molecular clauses and atomic facts. Extensive evaluations demonstrate the superior performance of our proposed method over the state-of-the-arts on several datasets, with an additionally proposed new metric $Attr_{p}$ for evaluating the precision of evidence attribution. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16708v1-abstract-full').style.display = 'none'; document.getElementById('2410.16708v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16232">arXiv:2410.16232</a> <span> [<a href="https://arxiv.org/pdf/2410.16232">pdf</a>, <a href="https://arxiv.org/format/2410.16232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Sketch2Code: Evaluating Vision-Language Models for Interactive Web Design Prototyping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ryan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanzhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Diyi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16232v1-abstract-short" style="display: inline;"> Sketches are a natural and accessible medium for UI designers to conceptualize early-stage ideas. However, existing research on UI/UX automation often requires high-fidelity inputs like Figma designs or detailed screenshots, limiting accessibility and impeding efficient design iteration. To bridge this gap, we introduce Sketch2Code, a benchmark that evaluates state-of-the-art Vision Language Model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16232v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16232v1-abstract-full" style="display: none;"> Sketches are a natural and accessible medium for UI designers to conceptualize early-stage ideas. However, existing research on UI/UX automation often requires high-fidelity inputs like Figma designs or detailed screenshots, limiting accessibility and impeding efficient design iteration. To bridge this gap, we introduce Sketch2Code, a benchmark that evaluates state-of-the-art Vision Language Models (VLMs) on automating the conversion of rudimentary sketches into webpage prototypes. Beyond end-to-end benchmarking, Sketch2Code supports interactive agent evaluation that mimics real-world design workflows, where a VLM-based agent iteratively refines its generations by communicating with a simulated user, either passively receiving feedback instructions or proactively asking clarification questions. We comprehensively analyze ten commercial and open-source models, showing that Sketch2Code is challenging for existing VLMs; even the most capable models struggle to accurately interpret sketches and formulate effective questions that lead to steady improvement. Nevertheless, a user study with UI/UX experts reveals a significant preference for proactive question-asking over passive feedback reception, highlighting the need to develop more effective paradigms for multi-turn conversational agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16232v1-abstract-full').style.display = 'none'; document.getElementById('2410.16232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint, 9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15762">arXiv:2410.15762</a> <span> [<a href="https://arxiv.org/pdf/2410.15762">pdf</a>, <a href="https://arxiv.org/format/2410.15762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Solving Sparse \& High-Dimensional-Output Regression via Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Renyuan Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhehui Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15762v1-abstract-short" style="display: inline;"> Multi-Output Regression (MOR) has been widely used in scientific data analysis for decision-making. Unlike traditional regression models, MOR aims to simultaneously predict multiple real-valued outputs given an input. However, the increasing dimensionality of the outputs poses significant challenges regarding interpretability and computational scalability for modern MOR applications. As a first st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15762v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15762v1-abstract-full" style="display: none;"> Multi-Output Regression (MOR) has been widely used in scientific data analysis for decision-making. Unlike traditional regression models, MOR aims to simultaneously predict multiple real-valued outputs given an input. However, the increasing dimensionality of the outputs poses significant challenges regarding interpretability and computational scalability for modern MOR applications. As a first step to address these challenges, this paper proposes a Sparse \& High-dimensional-Output REgression (SHORE) model by incorporating additional sparsity requirements to resolve the output interpretability, and then designs a computationally efficient two-stage optimization framework capable of solving SHORE with provable accuracy via compression on outputs. Theoretically, we show that the proposed framework is computationally scalable while maintaining the same order of training loss and prediction loss before-and-after compression under arbitrary or relatively weak sample set conditions. Empirically, numerical results further validate the theoretical findings, showcasing the efficiency and accuracy of the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15762v1-abstract-full').style.display = 'none'; document.getElementById('2410.15762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Admitted in Neurips 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15616">arXiv:2410.15616</a> <span> [<a href="https://arxiv.org/pdf/2410.15616">pdf</a>, <a href="https://arxiv.org/format/2410.15616">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Weighted Diversified Sampling for Efficient Data-Driven Single-Cell Gene-Gene Interaction Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuntao Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhao Li</a>, <a href="/search/cs?searchtype=author&query=Pahwa%2C+K">Khushbu Pahwa</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rongbin Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenjin Zheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhaozhuo Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15616v1-abstract-short" style="display: inline;"> Gene-gene interactions play a crucial role in the manifestation of complex human diseases. Uncovering significant gene-gene interactions is a challenging task. Here, we present an innovative approach utilizing data-driven computational tools, leveraging an advanced Transformer model, to unearth noteworthy gene-gene interactions. Despite the efficacy of Transformer models, their parameter intensity… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15616v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15616v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15616v1-abstract-full" style="display: none;"> Gene-gene interactions play a crucial role in the manifestation of complex human diseases. Uncovering significant gene-gene interactions is a challenging task. Here, we present an innovative approach utilizing data-driven computational tools, leveraging an advanced Transformer model, to unearth noteworthy gene-gene interactions. Despite the efficacy of Transformer models, their parameter intensity presents a bottleneck in data ingestion, hindering data efficiency. To mitigate this, we introduce a novel weighted diversified sampling algorithm. This algorithm computes the diversity score of each data sample in just two passes of the dataset, facilitating efficient subset generation for interaction discovery. Our extensive experimentation demonstrates that by sampling a mere 1\% of the single-cell dataset, we achieve performance comparable to that of utilizing the entire dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15616v1-abstract-full').style.display = 'none'; document.getElementById('2410.15616v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14966">arXiv:2410.14966</a> <span> [<a href="https://arxiv.org/pdf/2410.14966">pdf</a>, <a href="https://arxiv.org/format/2410.14966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Attack as Defense: Run-time Backdoor Implantation for Image Content Protection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haichuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+M">Meiyu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Renyuan Li</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zhiyuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+M">Mingjie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14966v1-abstract-short" style="display: inline;"> As generative models achieve great success, tampering and modifying the sensitive image contents (i.e., human faces, artist signatures, commercial logos, etc.) have induced a significant threat with social impact. The backdoor attack is a method that implants vulnerabilities in a target model, which can be activated through a trigger. In this work, we innovatively prevent the abuse of image conten… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14966v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14966v1-abstract-full" style="display: none;"> As generative models achieve great success, tampering and modifying the sensitive image contents (i.e., human faces, artist signatures, commercial logos, etc.) have induced a significant threat with social impact. The backdoor attack is a method that implants vulnerabilities in a target model, which can be activated through a trigger. In this work, we innovatively prevent the abuse of image content modification by implanting the backdoor into image-editing models. Once the protected sensitive content on an image is modified by an editing model, the backdoor will be triggered, making the editing fail. Unlike traditional backdoor attacks that use data poisoning, to enable protection on individual images and eliminate the need for model training, we developed the first framework for run-time backdoor implantation, which is both time- and resource- efficient. We generate imperceptible perturbations on the images to inject the backdoor and define the protected area as the only backdoor trigger. Editing other unprotected insensitive areas will not trigger the backdoor, which minimizes the negative impact on legal image modifications. Evaluations with state-of-the-art image editing models show that our protective method can increase the CLIP-FID of generated images from 12.72 to 39.91, or reduce the SSIM from 0.503 to 0.167 when subjected to malicious editing. At the same time, our method exhibits minimal impact on benign editing, which demonstrates the efficacy of our proposed framework. The proposed run-time backdoor can also achieve effective protection on the latest diffusion models. Code are available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14966v1-abstract-full').style.display = 'none'; document.getElementById('2410.14966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14380">arXiv:2410.14380</a> <span> [<a href="https://arxiv.org/pdf/2410.14380">pdf</a>, <a href="https://arxiv.org/format/2410.14380">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dual-Label Learning With Irregularly Present Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingqian Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Q">Qiao Han</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yiteng Zhai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruifeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yao Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14380v2-abstract-short" style="display: inline;"> In multi-task learning, we often encounter the case when the presence of labels across samples exhibits irregular patterns: samples can be fully labeled, partially labeled or unlabeled. Taking drug analysis as an example, multiple toxicity properties of a drug molecule may not be concurrently available due to experimental limitations. It triggers a demand for a new training and inference mechanism… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14380v2-abstract-full').style.display = 'inline'; document.getElementById('2410.14380v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14380v2-abstract-full" style="display: none;"> In multi-task learning, we often encounter the case when the presence of labels across samples exhibits irregular patterns: samples can be fully labeled, partially labeled or unlabeled. Taking drug analysis as an example, multiple toxicity properties of a drug molecule may not be concurrently available due to experimental limitations. It triggers a demand for a new training and inference mechanism that could accommodate irregularly present labels and maximize the utility of any available label information. In this work, we focus on the two-label learning task, and propose a novel training and inference framework, Dual-Label Learning (DLL). The DLL framework formulates the problem into a dual-function system, in which the two functions should simultaneously satisfy standard supervision, structural duality and probabilistic duality. DLL features a dual-tower model architecture that explicitly captures the information exchange between labels, aimed at maximizing the utility of partially available labels in understanding label correlation. During training, label imputation for missing labels is conducted as part of the forward propagation process, while during inference, labels are regarded as unknowns of a bivariate system of equations and are solved jointly. Theoretical analysis guarantees the feasibility of DLL, and extensive experiments are conducted to verify that by explicitly modeling label correlation and maximizing the utility of available labels, our method makes consistently better predictions than baseline approaches by up to a 10% gain in F1-score or MAPE. Remarkably, our method provided with data at a label missing rate as high as 60% can achieve similar or even better results than baseline approaches at a label missing rate of only 10%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14380v2-abstract-full').style.display = 'none'; document.getElementById('2410.14380v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14237">arXiv:2410.14237</a> <span> [<a href="https://arxiv.org/pdf/2410.14237">pdf</a>, <a href="https://arxiv.org/ps/2410.14237">ps</a>, <a href="https://arxiv.org/format/2410.14237">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Unified Convergence Analysis for Score-Based Diffusion Models with Deterministic Samplers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Runjia Li</a>, <a href="/search/cs?searchtype=author&query=Di%2C+Q">Qiwei Di</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Q">Quanquan Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14237v1-abstract-short" style="display: inline;"> Score-based diffusion models have emerged as powerful techniques for generating samples from high-dimensional data distributions. These models involve a two-phase process: first, injecting noise to transform the data distribution into a known prior distribution, and second, sampling to recover the original data distribution from noise. Among the various sampling methods, deterministic samplers sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14237v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14237v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14237v1-abstract-full" style="display: none;"> Score-based diffusion models have emerged as powerful techniques for generating samples from high-dimensional data distributions. These models involve a two-phase process: first, injecting noise to transform the data distribution into a known prior distribution, and second, sampling to recover the original data distribution from noise. Among the various sampling methods, deterministic samplers stand out for their enhanced efficiency. However, analyzing these deterministic samplers presents unique challenges, as they preclude the use of established techniques such as Girsanov's theorem, which are only applicable to stochastic samplers. Furthermore, existing analysis for deterministic samplers usually focuses on specific examples, lacking a generalized approach for general forward processes and various deterministic samplers. Our paper addresses these limitations by introducing a unified convergence analysis framework. To demonstrate the power of our framework, we analyze the variance-preserving (VP) forward process with the exponential integrator (EI) scheme, achieving iteration complexity of $\tilde O(d^2/蔚)$. Additionally, we provide a detailed analysis of Denoising Diffusion Implicit Models (DDIM)-type samplers, which have been underexplored in previous research, achieving polynomial iteration complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14237v1-abstract-full').style.display = 'none'; document.getElementById('2410.14237v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">68 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13567">arXiv:2410.13567</a> <span> [<a href="https://arxiv.org/pdf/2410.13567">pdf</a>, <a href="https://arxiv.org/format/2410.13567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CCUP: A Controllable Synthetic Data Generation Pipeline for Pretraining Cloth-Changing Person Re-Identification Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yujian Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chengru Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yinong Xu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xuanzheng Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiyu Li</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+G">Guanglin Niu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13567v1-abstract-short" style="display: inline;"> Cloth-changing person re-identification (CC-ReID), also known as Long-Term Person Re-Identification (LT-ReID) is a critical and challenging research topic in computer vision that has recently garnered significant attention. However, due to the high cost of constructing CC-ReID data, the existing data-driven models are hard to train efficiently on limited data, causing overfitting issue. To address… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13567v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13567v1-abstract-full" style="display: none;"> Cloth-changing person re-identification (CC-ReID), also known as Long-Term Person Re-Identification (LT-ReID) is a critical and challenging research topic in computer vision that has recently garnered significant attention. However, due to the high cost of constructing CC-ReID data, the existing data-driven models are hard to train efficiently on limited data, causing overfitting issue. To address this challenge, we propose a low-cost and efficient pipeline for generating controllable and high-quality synthetic data simulating the surveillance of real scenarios specific to the CC-ReID task. Particularly, we construct a new self-annotated CC-ReID dataset named Cloth-Changing Unreal Person (CCUP), containing 6,000 IDs, 1,179,976 images, 100 cameras, and 26.5 outfits per individual. Based on this large-scale dataset, we introduce an effective and scalable pretrain-finetune framework for enhancing the generalization capabilities of the traditional CC-ReID models. The extensive experiments demonstrate that two typical models namely TransReID and FIRe^2, when integrated into our framework, outperform other state-of-the-art models after pretraining on CCUP and finetuning on the benchmarks such as PRCC, VC-Clothes and NKUP. The CCUP is available at: https://github.com/yjzhao1019/CCUP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13567v1-abstract-full').style.display = 'none'; document.getElementById('2410.13567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12957">arXiv:2410.12957</a> <span> [<a href="https://arxiv.org/pdf/2410.12957">pdf</a>, <a href="https://arxiv.org/format/2410.12957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic Synchronization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xize Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12957v1-abstract-short" style="display: inline;"> Generating music that aligns with the visual content of a video has been a challenging task, as it requires a deep understanding of visual semantics and involves generating music whose melody, rhythm, and dynamics harmonize with the visual narratives. This paper presents MuVi, a novel framework that effectively addresses these challenges to enhance the cohesion and immersive experience of audio-vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12957v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12957v1-abstract-full" style="display: none;"> Generating music that aligns with the visual content of a video has been a challenging task, as it requires a deep understanding of visual semantics and involves generating music whose melody, rhythm, and dynamics harmonize with the visual narratives. This paper presents MuVi, a novel framework that effectively addresses these challenges to enhance the cohesion and immersive experience of audio-visual content. MuVi analyzes video content through a specially designed visual adaptor to extract contextually and temporally relevant features. These features are used to generate music that not only matches the video's mood and theme but also its rhythm and pacing. We also introduce a contrastive music-visual pre-training scheme to ensure synchronization, based on the periodicity nature of music phrases. In addition, we demonstrate that our flow-matching-based music generator has in-context learning ability, allowing us to control the style and genre of the generated music. Experimental results show that MuVi demonstrates superior performance in both audio quality and temporal synchronization. The generated music video samples are available at https://muvi-v2m.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12957v1-abstract-full').style.display = 'none'; document.getElementById('2410.12957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+R&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+R&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>