Search | arXiv e-print repository

Showing 1–50 of 281 results for author: Peng, C

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+C&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00623">arXiv:2412.00623</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.00623">pdf</a>, <a href="https://arxiv.org/format/2412.00623">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Lesson in Splats: Teacher-Guided Diffusion for 3D Gaussian Splats Generation with 2D Supervision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Sobol%2C+I">Ido Sobol</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Litany%2C+O">Or Litany</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00623v1-abstract-short" style="display: inline;"> We introduce a diffusion model for Gaussian Splats, SplatDiffusion, to enable generation of three-dimensional structures from single images, addressing the ill-posed nature of lifting 2D inputs to 3D. 
Existing methods rely on deterministic, feed-forward predictions, which limit their ability to handle the inherent ambiguity of 3D inference from 2D data. Diffusion models have recently shown promise&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00623v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00623v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00623v1-abstract-full" style="display: none;"> We introduce a diffusion model for Gaussian Splats, SplatDiffusion, to enable generation of three-dimensional structures from single images, addressing the ill-posed nature of lifting 2D inputs to 3D. Existing methods rely on deterministic, feed-forward predictions, which limit their ability to handle the inherent ambiguity of 3D inference from 2D data. Diffusion models have recently shown promise as powerful generative models for 3D data, including Gaussian splats; however, standard diffusion frameworks typically require the target signal and denoised signal to be in the same modality, which is challenging given the scarcity of 3D data. To overcome this, we propose a novel training strategy that decouples the denoised modality from the supervision modality. By using a deterministic model as a noisy teacher to create the noised signal and transitioning from single-step to multi-step denoising supervised by an image rendering loss, our approach significantly enhances performance compared to the deterministic teacher. Additionally, our method is flexible, as it can learn from various 3D Gaussian Splat (3DGS) teachers with minimal adaptation; we demonstrate this by surpassing the performance of two different deterministic models as teachers, highlighting the potential generalizability of our framework. Our approach further incorporates a guidance mechanism to aggregate information from multiple views, enhancing reconstruction quality when more than one view is available. Experimental results on object-level and scene-level datasets demonstrate the effectiveness of our framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00623v1-abstract-full').style.display = 'none'; document.getElementById('2412.00623v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
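The decoupled-supervision recipe in this abstract lends itself to a compact sketch. Everything below is an illustration under assumed interfaces (`teacher`, `denoiser`, `renderer`, and the cosine schedule are placeholders, not the authors' code):

```python
import torch

def splat_diffusion_step(image, camera, teacher, denoiser, renderer, T=1000):
    """Sketch of one training step with decoupled supervision (illustrative)."""
    # 1. A frozen deterministic model acts as the "noisy teacher": its splat
    #    prediction stands in for the missing 3D ground truth.
    with torch.no_grad():
        splats_teacher = teacher(image)              # (N, d) Gaussian parameters

    # 2. Diffuse the teacher splats to a random timestep (schedule is assumed).
    t = torch.randint(1, T, (1,))
    alpha_bar = torch.cos(t.float() / T * torch.pi / 2) ** 2
    noise = torch.randn_like(splats_teacher)
    splats_noisy = alpha_bar.sqrt() * splats_teacher + (1 - alpha_bar).sqrt() * noise

    # 3. The student denoises in splat space...
    splats_pred = denoiser(splats_noisy, t, image)

    # 4. ...but is supervised in image space through a differentiable renderer,
    #    decoupling the denoised modality from the supervision modality.
    return torch.nn.functional.mse_loss(renderer(splats_pred, camera), image)
```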
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18019">arXiv:2411.18019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18019">pdf</a>, <a href="https://arxiv.org/format/2411.18019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A Real-World Benchmark for Evaluating Fine-Grained Issue Solving Capabilities of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruida Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jingyi Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+X">Xiangxin Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qinyun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Pengfei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Cuiyun Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18019v1-abstract-short" style="display: inline;"> Automatically resolving software issues is crucial for software development in practice, impacting the software quality and user experience. The process of resolving real-world issues encompasses tasks such as question-answering (QA), fault localization, and code editing. Existing benchmarks such as HumanEval fall short in their ability to assess LLMs&#39; proficiency in solving issues within a codeba&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18019v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18019v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18019v1-abstract-full" style="display: none;"> Automatically resolving software issues is crucial for software development in practice, impacting the software quality and user experience. The process of resolving real-world issues encompasses tasks such as question-answering (QA), fault localization, and code editing. Existing benchmarks such as HumanEval fall short in their ability to assess LLMs&#39; proficiency in solving issues within a codebase. Although benchmarks like SWE-Bench are designed to evaluate the LLMs&#39; capability to handle real-world GitHub issues, the end-to-end evaluation method cannot provide granular insights on the performance of subtasks involved in issue solving. To address existing deficiencies in benchmarking LLMs for practical software engineering tasks, we introduce FAUN-Eval, a benchmark specifically designed to evaluate the Fine-grAined issUe solviNg capabilities of LLMs. FAUN-Eval systematically assesses LLMs across three distinct tasks: QA, fault localization, and code editing. This benchmark is constructed using a dataset curated from 30 well-known GitHub repositories. For each entry, issue and pull request (PR) pairs are meticulously compiled and validated using cross-referencing and keyword verification methods. 
FAUN-Eval includes 300 entries and employs both LLM and manual checks to ensure data quality. We evaluate ten LLMs with FAUN-Eval, including four closed-source and six open-source models. Our experimental results reveal several key findings. We find that the top-performing LLMs differ across the different tasks. Additionally, features in issues may lead LLMs to generate incorrect information. Moreover, models may vary in their proficiency with texts of different lengths. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18019v1-abstract-full').style.display = 'none'; document.getElementById('2411.18019v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18015">arXiv:2411.18015</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18015">pdf</a>, <a href="https://arxiv.org/format/2411.18015">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AEGIS: An Agent-based Framework for General Bug Reproduction from Issue Descriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Pengfei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+X">Xiangxin Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruida Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Cuiyun Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18015v1-abstract-short" style="display: inline;"> In software maintenance, bug reproduction is essential for effective fault localization and repair. Manually writing reproduction scripts is a time-consuming task with high requirements for developers. Hence, automation of bug reproduction has increasingly attracted attention from researchers and practitioners. However, the existing studies on bug reproduction are generally limited to specific bug&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18015v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18015v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18015v1-abstract-full" style="display: none;"> In software maintenance, bug reproduction is essential for effective fault localization and repair. Manually writing reproduction scripts is a time-consuming task with high requirements for developers. Hence, automation of bug reproduction has increasingly attracted attention from researchers and practitioners. 
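A rough picture of what one issue-PR benchmark entry and its fine-grained scoring could look like; the field names and the `model` interface below are invented for illustration, since the abstract does not publish the schema:

```python
from dataclasses import dataclass

@dataclass
class FaunEvalEntry:
    """Hypothetical shape of one entry: an issue paired with its resolving PR,
    from which the three subtask targets are derived."""
    repo: str                 # one of the 30 source repositories
    issue_url: str
    pr_url: str
    qa: dict                  # {"question": ..., "reference_answer": ...}
    fault_location: list      # files/lines touched by the linked PR
    code_edit: dict           # {"before": ..., "after": ...} hunks

def evaluate(model, entries):
    # Score each subtask separately, mirroring the fine-grained setup.
    scores = {"qa": [], "localization": [], "editing": []}
    for e in entries:
        scores["qa"].append(model.answer(e.qa["question"]) == e.qa["reference_answer"])
        hits = set(model.localize(e.issue_url)) & set(e.fault_location)
        scores["localization"].append(bool(hits))
        scores["editing"].append(model.edit(e.code_edit["before"]) == e.code_edit["after"])
    return {k: sum(v) / max(len(v), 1) for k, v in scores.items()}
```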
3. arXiv:2411.18015 (https://arxiv.org/abs/2411.18015) [pdf, other] cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
AEGIS: An Agent-based Framework for General Bug Reproduction from Issue Descriptions
Authors: Xinchen Wang, Pengfei Gao, Xiangxin Meng, Chao Peng, Ruida Hu, Yun Lin, Cuiyun Gao
Abstract: In software maintenance, bug reproduction is essential for effective fault localization and repair. Manually writing reproduction scripts is a time-consuming task with high requirements for developers. Hence, automation of bug reproduction has increasingly attracted attention from researchers and practitioners. However, the existing studies on bug reproduction are generally limited to specific bug types such as program crashes, and are hard to apply to general bug reproduction. In this paper, considering the superior performance of agent-based methods in code intelligence tasks, we focus on designing an agent-based framework for the task. Directly employing agents would lead to limited bug reproduction performance, due to entangled subtasks, lengthy retrieved context, and unregulated actions. To mitigate the challenges, we propose an Automated gEneral buG reproductIon Scripts generation framework, named AEGIS, which is the first agent-based framework for the task. AEGIS mainly contains two modules: (1) a concise context construction module, which aims to guide the code agent in extracting structured information from issue descriptions, identifying issue-related code with detailed explanations, and integrating these elements to construct the concise context; (2) an FSM-based multi-feedback optimization module to further regulate the behavior of the code agent within the finite state machine (FSM), ensuring a controlled and efficient script generation process based on multi-dimensional feedback. Extensive experiments on the public benchmark dataset show that AEGIS outperforms the state-of-the-art baseline by 23.0% in F->P metric. In addition, the bug reproduction scripts generated by AEGIS can improve the relative resolved rate of Agentless by 12.5%.
Submitted 26 November, 2024; originally announced November 2024.
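The FSM-regulated loop can be sketched as follows; the states, transitions, and the `agent`/`sandbox` interfaces are invented stand-ins for AEGIS's two modules, not its actual API:

```python
from enum import Enum, auto

class State(Enum):
    EXTRACT = auto()   # build the concise context from the issue text
    GENERATE = auto()  # draft a reproduction script
    EXECUTE = auto()   # run it in a sandbox
    DONE = auto()

def reproduce(issue, agent, sandbox, max_rounds=5):
    """Illustrative FSM loop: the machine restricts which action the code
    agent may take next, and each run result feeds back into generation."""
    state, context, script = State.EXTRACT, None, None
    for _ in range(max_rounds):
        if state is State.EXTRACT:
            context = agent.build_concise_context(issue)  # structured info + related code
            state = State.GENERATE
        elif state is State.GENERATE:
            script = agent.write_script(context)
            state = State.EXECUTE
        elif state is State.EXECUTE:
            result = sandbox.run(script)
            if result.reproduces_bug:
                state = State.DONE
            else:
                # multi-dimensional feedback edge back to generation
                context = agent.incorporate_feedback(context, result)
                state = State.GENERATE
        else:
            break
    return script
```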
4. arXiv:2411.17067 (https://arxiv.org/abs/2411.17067) [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
Geometry Field Splatting with Gaussian Surfels
Authors: Kaiwen Jiang, Venkataram Sivaram, Cheng Peng, Ravi Ramamoorthi
Abstract: Geometric reconstruction of opaque surfaces from images is a longstanding challenge in computer vision, with renewed interest from volumetric view synthesis algorithms using radiance fields. We leverage the geometry field proposed in recent work for stochastic opaque surfaces, which can then be converted to volume densities. We adapt Gaussian kernels or surfels to splat the geometry field rather than the volume, enabling precise reconstruction of opaque solids. Our first contribution is to derive an efficient and almost exact differentiable rendering algorithm for geometry fields parameterized by Gaussian surfels, while removing current approximations involving Taylor series and no self-attenuation. Next, we address the discontinuous loss landscape when surfels cluster near geometry, showing how to guarantee that the rendered color is a continuous function of the colors of the kernels, irrespective of ordering. Finally, we use latent representations with spherical harmonics encoded reflection vectors rather than spherical harmonics encoded colors to better address specular surfaces. We demonstrate significant improvement in the quality of reconstructed 3D surfaces on widely-used datasets.
Submitted 25 November, 2024; originally announced November 2024.
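For context, this is the standard depth-ordered alpha compositing used by splatting pipelines, whose dependence on kernel ordering is exactly what the paper's continuous formulation removes. A minimal sketch of the baseline only, not the paper's renderer:

```python
import numpy as np

def composite(colors, alphas):
    """Depth-ordered alpha compositing, 3DGS-style:
    c = sum_i w_i c_i with w_i = alpha_i * prod_{j<i} (1 - alpha_j).
    Note w_i depends on the front-to-back sort order."""
    transmittance = np.cumprod(np.concatenate([[1.0], 1.0 - alphas[:-1]]))
    weights = alphas * transmittance
    return weights @ colors

colors = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  # two surfels, front to back
print(composite(colors, np.array([0.6, 0.9])))          # swapping the order changes the result
```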
5. arXiv:2411.12592 (https://arxiv.org/abs/2411.12592) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
SPARS3R: Semantic Prior Alignment and Regularization for Sparse 3D Reconstruction
Authors: Yutao Tang, Yuxiang Guo, Deming Li, Cheng Peng
Abstract: Recent efforts in Gaussian-Splat-based Novel View Synthesis can achieve photorealistic rendering; however, such capability is limited in sparse-view scenarios due to sparse initialization and over-fitting floaters. Recent progress in depth estimation and alignment can provide a dense point cloud with few views; however, the resulting pose accuracy is suboptimal. In this work, we present SPARS3R, which combines the advantages of accurate pose estimation from Structure-from-Motion and a dense point cloud from depth estimation. To this end, SPARS3R first performs a Global Fusion Alignment process that maps a prior dense point cloud to a sparse point cloud from Structure-from-Motion based on triangulated correspondences. RANSAC is applied during this process to distinguish inliers and outliers. SPARS3R then performs a second, Semantic Outlier Alignment step, which extracts semantically coherent regions around the outliers and performs local alignment in these regions. Along with several improvements in the evaluation process, we demonstrate that SPARS3R can achieve photorealistic rendering with sparse images and significantly outperforms existing approaches.
Submitted 15 November, 2024; originally announced November 2024.
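The Global Fusion Alignment step, RANSAC over triangulated correspondences with a closed-form similarity fit, can be sketched with the classic Umeyama solver. Thresholds, iteration counts, and function names are illustrative assumptions:

```python
import numpy as np

def umeyama(src, dst):
    """Closed-form similarity fit (Umeyama, 1991): dst ~ s * R @ src + t."""
    mu_s, mu_d = src.mean(0), dst.mean(0)
    xs, xd = src - mu_s, dst - mu_d
    cov = xd.T @ xs / len(src)
    U, S, Vt = np.linalg.svd(cov)
    d = np.sign(np.linalg.det(U) * np.linalg.det(Vt))   # reflection guard
    R = U @ np.diag([1.0, 1.0, d]) @ Vt
    s = (S @ np.array([1.0, 1.0, d])) * len(src) / (xs ** 2).sum()
    t = mu_d - s * R @ mu_s
    return s, R, t

def ransac_align(src, dst, iters=500, thresh=0.05):
    """RANSAC over minimal 3-point samples to split inliers from outliers,
    aligning a dense prior cloud to the sparse SfM cloud."""
    rng = np.random.default_rng(0)
    best = np.zeros(len(src), dtype=bool)
    for _ in range(iters):
        idx = rng.choice(len(src), 3, replace=False)
        s, R, t = umeyama(src[idx], dst[idx])
        err = np.linalg.norm(src @ (s * R).T + t - dst, axis=1)
        inliers = err < thresh
        if inliers.sum() > best.sum():
            best = inliers
    return umeyama(src[best], dst[best]), best
```

In the paper's pipeline the outliers left by this global fit are then handled by the local Semantic Outlier Alignment step.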
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11921">arXiv:2411.11921</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11921">pdf</a>, <a href="https://arxiv.org/format/2411.11921">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DeSiRe-GS: 4D Street Gaussians for Static-Dynamic Decomposition and Surface Reconstruction for Urban Driving Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yichen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wei Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11921v1-abstract-short" style="display: inline;"> We present DeSiRe-GS, a self-supervised gaussian splatting representation, enabling effective static-dynamic decomposition and high-fidelity surface reconstruction in complex driving scenarios. Our approach employs a two-stage optimization pipeline of dynamic street Gaussians. In the first stage, we extract 2D motion masks based on the observation that 3D Gaussian Splatting inherently can reconstr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11921v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11921v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11921v1-abstract-full" style="display: none;"> We present DeSiRe-GS, a self-supervised gaussian splatting representation, enabling effective static-dynamic decomposition and high-fidelity surface reconstruction in complex driving scenarios. Our approach employs a two-stage optimization pipeline of dynamic street Gaussians. In the first stage, we extract 2D motion masks based on the observation that 3D Gaussian Splatting inherently can reconstruct only the static regions in dynamic environments. These extracted 2D motion priors are then mapped into the Gaussian space in a differentiable manner, leveraging an efficient formulation of dynamic Gaussians in the second stage. Combined with the introduced geometric regularizations, our method are able to address the over-fitting issues caused by data sparsity in autonomous driving, reconstructing physically plausible Gaussians that align with object surfaces rather than floating in air. Furthermore, we introduce temporal cross-view consistency to ensure coherence across time and viewpoints, resulting in high-quality surface reconstruction. 
Comprehensive experiments demonstrate the efficiency and effectiveness of DeSiRe-GS, surpassing prior self-supervised arts and achieving accuracy comparable to methods relying on external 3D bounding box annotations. Code is available at \url{https://github.com/chengweialan/DeSiRe-GS} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11921v1-abstract-full').style.display = 'none'; document.getElementById('2411.11921v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11912">arXiv:2411.11912</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11912">pdf</a>, <a href="https://arxiv.org/format/2411.11912">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> F$^3$OCUS -- Federated Finetuning of Vision-Language Foundation Models with Optimal Client Layer Updating Strategy via Multi-objective Meta-Heuristics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saha%2C+P">Pramit Saha</a>, <a href="/search/cs?searchtype=author&amp;query=Wagner%2C+F">Felix Wagner</a>, <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+D">Divyanshu Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Can Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Thakur%2C+A">Anshul Thakur</a>, <a href="/search/cs?searchtype=author&amp;query=Clifton%2C+D">David Clifton</a>, <a href="/search/cs?searchtype=author&amp;query=Kamnitsas%2C+K">Konstantinos Kamnitsas</a>, <a href="/search/cs?searchtype=author&amp;query=Noble%2C+J+A">J. Alison Noble</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11912v1-abstract-short" style="display: inline;"> Effective training of large Vision-Language Models (VLMs) on resource-constrained client devices in Federated Learning (FL) requires the usage of parameter-efficient fine-tuning (PEFT) strategies. To this end, we demonstrate the impact of two factors \textit{viz.}, client-specific layer importance score that selects the most important VLM layers for fine-tuning and inter-client layer diversity sco&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11912v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11912v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11912v1-abstract-full" style="display: none;"> Effective training of large Vision-Language Models (VLMs) on resource-constrained client devices in Federated Learning (FL) requires the usage of parameter-efficient fine-tuning (PEFT) strategies. 
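Stage one can be pictured as thresholding the photometric residual of a static-only reconstruction; `static_renderer` and the threshold `tau` are assumptions for illustration, not the released implementation:

```python
import torch

def motion_masks(static_renderer, cameras, images, tau=0.1):
    """Hedged sketch of stage one: a vanilla 3DGS fit on a dynamic scene
    reconstructs only the static parts well, so large photometric
    residuals flag pixels that are likely moving."""
    masks = []
    for cam, img in zip(cameras, images):
        rendered = static_renderer(cam)                # (3, H, W)
        residual = (rendered - img).abs().mean(dim=0)  # per-pixel error
        masks.append(residual > tau)                   # True where motion is likely
    return masks
```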
7. arXiv:2411.11912 (https://arxiv.org/abs/2411.11912) [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
F$^3$OCUS -- Federated Finetuning of Vision-Language Foundation Models with Optimal Client Layer Updating Strategy via Multi-objective Meta-Heuristics
Authors: Pramit Saha, Felix Wagner, Divyanshu Mishra, Can Peng, Anshul Thakur, David Clifton, Konstantinos Kamnitsas, J. Alison Noble
Abstract: Effective training of large Vision-Language Models (VLMs) on resource-constrained client devices in Federated Learning (FL) requires the use of parameter-efficient fine-tuning (PEFT) strategies. To this end, we demonstrate the impact of two factors, viz., a client-specific layer importance score that selects the most important VLM layers for fine-tuning and an inter-client layer diversity score that encourages diverse layer selection across clients for optimal VLM layer selection. We first theoretically motivate and leverage the principal eigenvalue magnitude of layerwise Neural Tangent Kernels and show its effectiveness as a client-specific layer importance score. Next, we propose a novel layer updating strategy dubbed F$^3$OCUS that jointly optimizes the layer importance and diversity factors by employing a data-free, multi-objective, meta-heuristic optimization on the server. We explore 5 different meta-heuristic algorithms and compare their effectiveness for selecting model layers and adapter layers towards PEFT-FL. Furthermore, we release a new MedVQA-FL dataset involving overall 707,962 VQA triplets and 9 modality-specific clients and utilize it to train and evaluate our method. Overall, we conduct more than 10,000 client-level experiments on 6 Vision-Language FL task settings involving 58 medical image datasets and 4 different VLM architectures of varying sizes to demonstrate the effectiveness of the proposed method.
Submitted 17 November, 2024; originally announced November 2024.
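The layer-importance score rests on the principal eigenvalue of a layerwise empirical NTK, which on a small probe batch reduces to the top eigenvalue of a gradient Gram matrix. A hedged sketch (the probe batch and scalar output surrogate are assumptions, not the paper's exact recipe):

```python
import torch

def layer_importance(model, layer_params, inputs):
    """Top eigenvalue of the layerwise empirical NTK K = J @ J.T, where J
    stacks per-example gradients of a scalar output w.r.t. one layer's
    parameters. Larger value ~ more important layer for this client."""
    grads = []
    for x in inputs:                       # small probe batch of examples
        out = model(x.unsqueeze(0)).sum()  # scalar surrogate of the output
        g = torch.autograd.grad(out, layer_params)
        grads.append(torch.cat([p.reshape(-1) for p in g]))
    J = torch.stack(grads)                 # (B, P) per-example gradients
    K = J @ J.T                            # (B, B) layerwise NTK on the batch
    return torch.linalg.eigvalsh(K)[-1].item()  # principal eigenvalue
```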
8. arXiv:2411.10213 (https://arxiv.org/abs/2411.10213) [pdf, other] cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
An Empirical Study on LLM-based Agents for Automated Bug Fixing
Authors: Xiangxin Meng, Zexiong Ma, Pengfei Gao, Chao Peng
Abstract: Large language models (LLMs) and LLM-based agents have been applied to fix bugs automatically, demonstrating their capability to address software defects by engaging in development environment interaction, iterative validation, and code modification. However, systematic analysis of these agent and non-agent systems remains limited, particularly regarding performance variations among top-performing ones. In this paper, we examine seven proprietary and open-source systems on the SWE-bench Lite benchmark for automated bug fixing. We first assess each system's overall performance, noting instances solvable by all or none of these systems, and explore why some instances are uniquely solved by specific system types. We also compare fault localization accuracy at file and line levels and evaluate bug reproduction capabilities, identifying instances solvable only through dynamic reproduction. Through analysis, we conclude that further optimization is needed in both the LLM itself and the design of the agentic flow to improve the effectiveness of the agent in bug fixing.
Submitted 15 November, 2024; originally announced November 2024.
9. arXiv:2411.06634 (https://arxiv.org/abs/2411.06634) [pdf, other] cs.LG (Machine Learning)
Inductive Graph Few-shot Class Incremental Learning
Authors: Yayong Li, Peyman Moghadam, Can Peng, Nan Ye, Piotr Koniusz
Abstract: Node classification with Graph Neural Networks (GNN) under a fixed set of labels is well known, in contrast to Graph Few-Shot Class Incremental Learning (GFSCIL), which involves learning a GNN classifier as graph nodes and classes grow over time sporadically. We introduce inductive GFSCIL, which continually learns novel classes with newly emerging nodes while maintaining performance on old classes without accessing previous data. This addresses the practical concern of transductive GFSCIL, which requires storing the entire graph with historical data. Compared to the transductive setting, the inductive setting exacerbates catastrophic forgetting due to inaccessible previous data during incremental training, in addition to the overfitting issue caused by label sparsity. Thus, we propose a novel method, called Topology-based class Augmentation and Prototype calibration (TAP). To be specific, it first creates a triple-branch multi-topology class augmentation method to enhance model generalization ability. As each incremental session receives a disjoint subgraph with nodes of novel classes, the multi-topology class augmentation method helps replicate such a setting in the base session to boost backbone versatility. In incremental learning, given the limited number of novel class samples, we propose an iterative prototype calibration to improve the separation of class prototypes. Furthermore, as backbone fine-tuning causes feature distribution drift and prototypes of old classes start failing over time, we propose the prototype shift method for old classes to compensate for the drift. We showcase the proposed method on four datasets.
Submitted 10 November, 2024; originally announced November 2024.
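One common way to realize a prototype shift for old classes, offered purely as an illustration of the idea (the paper's exact formula may differ), is similarity-weighted drift compensation estimated from samples that can be embedded both before and after fine-tuning:

```python
import torch

def shift_old_prototypes(old_protos, feats_before, feats_after, sigma=1.0):
    """Estimate how the feature space moved during fine-tuning from currently
    available samples, then move each old-class prototype by a
    similarity-weighted average of those observed drifts."""
    drift = feats_after - feats_before              # (M, D) observed drifts
    # weight each drift by closeness of its pre-update feature to the prototype
    d2 = torch.cdist(old_protos, feats_before) ** 2  # (C, M) squared distances
    w = torch.softmax(-d2 / (2 * sigma ** 2), dim=1)
    return old_protos + w @ drift                    # (C, D) shifted prototypes
```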
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03619">arXiv:2411.03619</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03619">pdf</a>, <a href="https://arxiv.org/format/2411.03619">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Safe Bipedal Robot Navigation using Linear Discrete Control Barrier Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chengyang Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Paredes%2C+V">Victor Paredes</a>, <a href="/search/cs?searchtype=author&amp;query=Castillo%2C+G+A">Guillermo A. Castillo</a>, <a href="/search/cs?searchtype=author&amp;query=Hereid%2C+A">Ayonga Hereid</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03619v1-abstract-short" style="display: inline;"> Safe navigation in real-time is an essential task for humanoid robots in real-world deployment. Since humanoid robots are inherently underactuated thanks to unilateral ground contacts, a path is considered safe if it is obstacle-free and respects the robot&#39;s physical limitations and underlying dynamics. Existing approaches often decouple path planning from gait control due to the significant compu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03619v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03619v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03619v1-abstract-full" style="display: none;"> Safe navigation in real-time is an essential task for humanoid robots in real-world deployment. Since humanoid robots are inherently underactuated thanks to unilateral ground contacts, a path is considered safe if it is obstacle-free and respects the robot&#39;s physical limitations and underlying dynamics. Existing approaches often decouple path planning from gait control due to the significant computational challenge caused by the full-order robot dynamics. In this work, we develop a unified, safe path and gait planning framework that can be evaluated online in real-time, allowing the robot to navigate clustered environments while sustaining stable locomotion. Our approach uses the popular Linear Inverted Pendulum (LIP) model as a template model to represent walking dynamics. It incorporates heading angles in the model to evaluate kinematic constraints essential for physically feasible gaits properly. In addition, we leverage discrete control barrier functions (DCBF) for obstacle avoidance, ensuring that the subsequent foot placement provides a safe navigation path within clustered environments. To guarantee real-time computation, we use a novel approximation of the DCBF to produce linear DCBF (LDCBF) constraints. We validate the proposed approach in simulation using a Digit robot in randomly generated environments. 
The results demonstrate that our approach can generate safe gaits for a non-trivial humanoid robot to navigate environments with randomly generated obstacles in real-time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03619v1-abstract-full').style.display = 'none'; document.getElementById('2411.03619v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01123">arXiv:2411.01123</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01123">pdf</a>, <a href="https://arxiv.org/format/2411.01123">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-Drive: Cross-modality consistent multi-sensor data synthesis for driving scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yichen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shuqi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+N">Nhat Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+A+T">Alexander T. Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Mingyu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wei Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01123v1-abstract-short" style="display: inline;"> Recent advancements have exploited diffusion models for the synthesis of either LiDAR point clouds or camera image data in driving scenarios. Despite their success in modeling single-modality data marginal distribution, there is an under-exploration in the mutual reliance between different modalities to describe complex driving scenes. To fill in this gap, we propose a novel framework, X-DRIVE, to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01123v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01123v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01123v1-abstract-full" style="display: none;"> Recent advancements have exploited diffusion models for the synthesis of either LiDAR point clouds or camera image data in driving scenarios. Despite their success in modeling single-modality data marginal distribution, there is an under-exploration in the mutual reliance between different modalities to describe complex driving scenes. 
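One standard way to obtain a linear DCBF from a distance barrier, consistent with the description above though not necessarily the paper's exact construction, is a supporting-hyperplane relaxation. The numbers and `gamma` are illustrative:

```python
import numpy as np

def ldcbf_constraint(x_k, obstacle, radius, gamma=0.3):
    """Linearized discrete CBF for a circular obstacle.

    True barrier: h(x) = ||x - o|| - r. Since ||x - o|| >= a.T @ (x - o)
    for any unit vector a, the surrogate
        h_lin(x) = a.T @ (x - o) - r,   a = (x_k - o) / ||x_k - o||,
    under-approximates h, so enforcing the DCBF condition
        h_lin(x_{k+1}) >= (1 - gamma) * h(x_k)
    yields a conservative (safe) *linear* constraint on the next state.
    Returns (A, b) with the constraint written as A @ x_{k+1} >= b."""
    a = (x_k - obstacle) / np.linalg.norm(x_k - obstacle)
    h_k = np.linalg.norm(x_k - obstacle) - radius
    A = a
    b = (1 - gamma) * h_k + a @ obstacle + radius
    return A, b

# robot at the origin, obstacle of radius 0.5 at (2, 0)
A, b = ldcbf_constraint(np.array([0.0, 0.0]), np.array([2.0, 0.0]), 0.5)
```

Because the constraint is linear in the decision variable, it can be stacked directly into a real-time QP or MPC over LIP footsteps, which is what makes the online evaluation tractable.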
11. arXiv:2411.01123 (https://arxiv.org/abs/2411.01123) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
X-Drive: Cross-modality consistent multi-sensor data synthesis for driving scenarios
Authors: Yichen Xie, Chenfeng Xu, Chensheng Peng, Shuqi Zhao, Nhat Ho, Alexander T. Pham, Mingyu Ding, Masayoshi Tomizuka, Wei Zhan
Abstract: Recent advancements have exploited diffusion models for the synthesis of either LiDAR point clouds or camera image data in driving scenarios. Despite their success in modeling single-modality data marginal distribution, there is an under-exploration in the mutual reliance between different modalities to describe complex driving scenes. To fill this gap, we propose a novel framework, X-DRIVE, to model the joint distribution of point clouds and multi-view images via a dual-branch latent diffusion model architecture. Considering the distinct geometrical spaces of the two modalities, X-DRIVE conditions the synthesis of each modality on the corresponding local regions from the other modality, ensuring better alignment and realism. To further handle the spatial ambiguity during denoising, we design the cross-modality condition module based on epipolar lines to adaptively learn the cross-modality local correspondence. Besides, X-DRIVE allows for controllable generation through multi-level input conditions, including text, bounding box, image, and point clouds. Extensive results demonstrate the high-fidelity synthetic results of X-DRIVE for both point clouds and multi-view images, adhering to input conditions while ensuring reliable cross-modality consistency. Our code will be made publicly available at https://github.com/yichen928/X-Drive.
Submitted 1 November, 2024; originally announced November 2024.
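The epipolar machinery behind the cross-modality condition module is standard two-view geometry: a pixel in one view constrains its correspondence in the other view to a line, and features can be aggregated along that line. A minimal helper (the toy fundamental matrix is fabricated for the example):

```python
import numpy as np

def epipolar_line(F, pixel):
    """Given fundamental matrix F and a pixel x in one view, the matching
    point in the other view lies on the line l = F @ x_h, returned as
    coefficients (a, b, c) of a*u + b*v + c = 0."""
    x_h = np.array([pixel[0], pixel[1], 1.0])
    l = F @ x_h
    return l / np.linalg.norm(l[:2])  # normalize so point-line distance is metric

F = np.array([[0.0, -0.001, 0.1],
              [0.001, 0.0, -0.2],
              [-0.1, 0.2, 1.0]])      # toy values, not calibrated
print(epipolar_line(F, (320.0, 240.0)))
```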
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20723">arXiv:2410.20723</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20723">pdf</a>, <a href="https://arxiv.org/format/2410.20723">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CompGS: Unleashing 2D Compositionality for Compositional Text-to-3D via Dynamically Optimizing 3D Gaussians </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ge%2C+C">Chongjian Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+M">Mingyu Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Jampani%2C+V">Varun Jampani</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wei Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20723v1-abstract-full"> Recent breakthroughs in text-guided image generation have significantly advanced the field of 3D generation. While generating a single high-quality 3D object is now feasible, generating multiple objects with reasonable interactions within a 3D space, a.k.a. compositional 3D generation, presents substantial challenges. This paper introduces CompGS, a novel generative framework that employs 3D Gaussian Splatting (GS) for efficient, compositional text-to-3D content generation. To achieve this goal, two core designs are proposed: (1) 3D Gaussians Initialization with 2D compositionality: We transfer the well-established 2D compositionality to initialize the Gaussian parameters on an entity-by-entity basis, ensuring both consistent 3D priors for each entity and reasonable interactions among multiple entities; (2) Dynamic Optimization: We propose a dynamic strategy to optimize 3D Gaussians using Score Distillation Sampling (SDS) loss. CompGS first automatically decomposes 3D Gaussians into distinct entity parts, enabling optimization at both the entity and composition levels. Additionally, CompGS optimizes across objects of varying scales by dynamically adjusting the spatial parameters of each entity, enhancing the generation of fine-grained details, particularly in smaller entities. Qualitative comparisons and quantitative evaluations on T3Bench demonstrate the effectiveness of CompGS in generating compositional 3D objects with superior image quality and semantic alignment over existing methods. CompGS can also be easily extended to controllable 3D editing, facilitating scene generation. We hope CompGS will provide new insights into compositional 3D generation. Project page: https://chongjiange.github.io/compgs.html. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19817">arXiv:2410.19817</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19817">pdf</a>, <a href="https://arxiv.org/format/2410.19817">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Step Guided Reasoning: Improving Mathematical Reasoning using Guidance Generation and Step Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+L">Lang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yitong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.19817v1-abstract-full"> Mathematical reasoning has been a challenging aspect of large language models (LLMs). However, the introduction of step-by-step Chain-of-Thought (CoT) inference has significantly advanced the mathematical capabilities of LLMs. Despite this progress, current approaches either require massive inference datasets as training datasets or rely on few-shot methods that often sacrifice accuracy. To address this bottleneck in mathematical reasoning, we propose a novel method called Step Guidance Reasoning that requires no further model fine-tuning. In this approach, LLMs reflect on small reasoning steps -- similar to how humans deliberate on and focus attention on what to do next. By incorporating this reflective process into the inference stage, LLMs can effectively guide their reasoning from one step to the next. Our method significantly improved math performance, raising the accuracy on the AMC23 dataset from 30% to 57.5%, a relative improvement of 91.7%; on the sampled level-5 problems of the MATH dataset, we achieved a relative accuracy improvement of 55.8%, increasing from 43% to 67%. </span> </p>
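<p class="is-size-7">The relative improvements quoted in the abstract above follow directly from the absolute accuracies; a quick check:</p>
<pre><code class="language-python"># Relative improvement = (new - old) / old, matching the quoted figures.
def rel_improvement(old, new):
    return (new - old) / old

print(f"AMC23: {rel_improvement(0.30, 0.575):.1%}")        # 91.7%
print(f"MATH level 5: {rel_improvement(0.43, 0.67):.1%}")  # 55.8%
</code></pre>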
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17629">arXiv:2410.17629</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17629">pdf</a>, <a href="https://arxiv.org/format/2410.17629">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Graph Signal Adaptive Message Passing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yi Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Changran Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Kuruoglu%2C+E+E">Ercan Engin Kuruoglu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.17629v2-abstract-full"> This paper proposes Graph Signal Adaptive Message Passing (GSAMP), a novel message passing method that simultaneously conducts online prediction, missing data imputation, and noise removal on time-varying graph signals. Unlike conventional Graph Signal Processing methods that apply the same filter to the entire graph, the spatiotemporal updates of GSAMP employ a distinct approach that utilizes localized computations at each node. This update is based on an adaptive solution obtained from an optimization problem designed to minimize the discrepancy between observed and estimated values. GSAMP effectively processes real-world, time-varying graph signals under Gaussian and impulsive noise conditions. </span> </p>
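<p class="is-size-7">GSAMP's actual update rule is defined in the paper; purely as an illustration of node-local adaptive processing that stays robust under impulsive noise, a generic sketch might look like this (the function, step size, and sign nonlinearity are assumptions, not GSAMP):</p>
<pre><code class="language-python">import numpy as np

def local_adaptive_step(x_hat, y, adj, observed, mu=0.1):
    """One spatiotemporal update: x_hat and y are length-N signals, adj is an
    N x N adjacency matrix, observed marks nodes with measurements."""
    x_new = x_hat.copy()
    for i in range(len(x_hat)):
        nbrs = np.flatnonzero(adj[i])
        # residual between observed and estimated values on the neighborhood
        r = np.where(observed[nbrs], y[nbrs] - x_hat[nbrs], 0.0)
        # sign() is a classic choice for robustness to impulsive noise
        x_new[i] += mu * np.sign(r).sum() / max(len(nbrs), 1)
    return x_new
</code></pre>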
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13720">arXiv:2410.13720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13720">pdf</a>, <a href="https://arxiv.org/format/2410.13720">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Movie Gen: A Cast of Media Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Polyak%2C+A">Adam Polyak</a>, <a href="/search/cs?searchtype=author&amp;query=Zohar%2C+A">Amit Zohar</a>, <a href="/search/cs?searchtype=author&amp;query=Brown%2C+A">Andrew Brown</a>, <a href="/search/cs?searchtype=author&amp;query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/cs?searchtype=author&amp;query=Sinha%2C+A">Animesh Sinha</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+A">Ann Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Vyas%2C+A">Apoorv Vyas</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+B">Bowen Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chih-Yao Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chuang%2C+C">Ching-Yao Chuang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+D">David Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Choudhary%2C+D">Dhruv Choudhary</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dingkang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sethi%2C+G">Geet Sethi</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+G">Guan Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+H">Haoyu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Misra%2C+I">Ishan Misra</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+J">Ji Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jialiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jagadeesh%2C+K">Kiran Jagadeesh</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kunpeng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Luxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+M">Mannat Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Williamson%2C+M">Mary Williamson</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+M">Matt Le</a> , et al.
(63 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.13720v1-abstract-full"> We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user&#39;s image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization, video editing, video-to-audio generation, and text-to-audio generation. Our largest video generation model is a 30B parameter transformer trained with a maximum context length of 73K video tokens, corresponding to a generated video of 16 seconds at 16 frames-per-second. We show multiple technical innovations and simplifications on the architecture, latent spaces, training objectives and recipes, data curation, evaluation protocols, parallelization techniques, and inference optimizations that allow us to reap the benefits of scaling pre-training data, model size, and training compute for training large scale media generation models. We hope this paper helps the research community to accelerate progress and innovation in media generation models. All videos from this paper are available at https://go.fb.me/MovieGenResearchVideos. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
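<p class="is-size-7">The video-token figures quoted above are internally consistent; a back-of-envelope check (the tokens-per-frame number is an inference from the quoted figures, not stated in the paper):</p>
<pre><code class="language-python"># A 16-second clip at 16 frames per second is 256 frames, so a 73K-token
# context implies roughly 285 tokens per frame (inferred, not a paper figure).
seconds, fps, context_tokens = 16, 16, 73_000
frames = seconds * fps
print(frames, context_tokens // frames)  # 256 frames, ~285 tokens/frame
</code></pre>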
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12928">arXiv:2410.12928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12928">pdf</a>, <a href="https://arxiv.org/format/2410.12928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DreamCraft3D++: Efficient Hierarchical 3D Generation with Multi-Plane Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+J">Jingxiang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R">Ruizhi Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yuan-Chen Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xiaochen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yangguang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yanpei Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yebin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.12928v1-abstract-full"> We introduce DreamCraft3D++, an extension of DreamCraft3D that enables efficient high-quality generation of complex 3D assets. DreamCraft3D++ inherits the multi-stage generation process of DreamCraft3D, but replaces the time-consuming geometry sculpting optimization with a feed-forward multi-plane based reconstruction model, speeding up the process by 1000x. For texture refinement, we propose a training-free IP-Adapter module that is conditioned on the enhanced multi-view images to enhance texture and geometry consistency, providing a 4x faster alternative to DreamCraft3D&#39;s DreamBooth fine-tuning. Experiments on diverse datasets demonstrate DreamCraft3D++&#39;s ability to generate creative 3D assets with intricate geometry and realistic 360&deg; textures, outperforming state-of-the-art image-to-3D methods in quality and speed. The full implementation will be open-sourced to enable new possibilities in 3D content creation. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://dreamcraft3dplus.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05707">arXiv:2410.05707</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05707">pdf</a>, <a href="https://arxiv.org/format/2410.05707">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Network Topology Inference from Smooth Signals Under Partial Observability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chuansen Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+H">Hanning Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhiguo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xiaojing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.05707v2-abstract-full"> Inferring network topology from smooth signals is a significant problem in data science and engineering. A common challenge in real-world scenarios is the availability of only partially observed nodes. While some studies have considered hidden nodes and proposed various optimization frameworks, existing methods often lack the practical efficiency needed for large-scale networks or fail to provide theoretical convergence guarantees. In this paper, we address the problem of inferring network topologies from smooth signals with partially observed nodes. We propose a first-order algorithmic framework that includes two variants: one based on column sparsity regularization and the other on a low-rank constraint. We establish theoretical convergence guarantees and demonstrate the linear convergence rate of our algorithms. Extensive experiments on both synthetic and real-world data show that our results align with theoretical predictions, exhibiting not only linear convergence but also superior speed compared to existing methods. To the best of our knowledge, this is the first work to propose a first-order algorithmic framework for inferring network structures from smooth signals under partial observability, offering both guaranteed linear convergence and practical effectiveness for large-scale networks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
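<p class="is-size-7">Column-sparsity regularization of the kind named in the abstract above is commonly handled with a proximal (block soft-thresholding) step inside a first-order loop; a generic sketch of that building block, not the authors' algorithm:</p>
<pre><code class="language-python">import numpy as np

def prox_column_sparsity(A, tau):
    """Block soft-thresholding for the penalty tau * sum_j ||A[:, j]||_2:
    each column is shrunk toward zero and dropped entirely if small."""
    norms = np.linalg.norm(A, axis=0, keepdims=True)
    scale = np.maximum(1.0 - tau / np.maximum(norms, 1e-12), 0.0)
    return A * scale

def first_order_step(A, grad, step, lam):
    """One iteration: gradient step on the smooth loss, then the proximal
    step on the nonsmooth column-sparsity term."""
    return prox_column_sparsity(A - step * grad, step * lam)
</code></pre>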
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20396">arXiv:2409.20396</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.20396">pdf</a>, <a href="https://arxiv.org/ps/2409.20396">ps</a>, <a href="https://arxiv.org/format/2409.20396">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Facility Location Games with Competitors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Houyu Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.20396v1-abstract-full"> In this paper, we consider facility location games with competitors where the agents are divided into groups and the agents in the same group have competitive relationships, i.e., the cost of an agent will increase if the facility is closer to their competitors. We consider three types of misreporting: misreporting the location only, misreporting the group membership only, and misreporting both. To minimize the social cost, we propose a strategyproof mechanism that is optimal when misreporting the location only. For the other two types of manipulation, we reuse the median mechanism and achieve tight bounds of 2. To minimize the maximum cost, we design new strategyproof mechanisms for the first two types of misreporting. We reuse the leftmost mechanism for misreporting both. All bounds are almost tight. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
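<p class="is-size-7">The median mechanism reused in the abstract above is the classic strategyproof rule for locating a single facility on a line: no agent can move the median toward itself by misreporting. A compact sketch:</p>
<pre><code class="language-python"># Classic median mechanism for single-facility location on a line.
# Strategyproof: misreporting cannot pull the median toward an agent.
def median_mechanism(reports):
    xs = sorted(reports)
    return xs[(len(xs) - 1) // 2]  # leftmost median for even counts

print(median_mechanism([0.1, 0.9, 0.4]))  # -&gt; 0.4
</code></pre>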
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15342">arXiv:2409.15342</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15342">pdf</a>, <a href="https://arxiv.org/format/2409.15342">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Recall: Empowering Multimodal Embedding for Edge Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+D">Dongqi Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shangguang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chen Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeling Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengwei Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.15342v1-abstract-full"> Human memory is inherently prone to forgetting. To address this, multimodal embedding models have been introduced, which transform diverse real-world data into a unified embedding space. These embeddings can be retrieved efficiently, aiding mobile users in recalling past information. However, as model complexity grows, so do its resource demands, leading to reduced throughput and heavy computational requirements that limit mobile device implementation. In this paper, we introduce RECALL, a novel on-device multimodal embedding system optimized for resource-limited mobile environments. RECALL achieves high-throughput, accurate retrieval by generating coarse-grained embeddings and leveraging query-based filtering for refined retrieval. Experimental results demonstrate that RECALL delivers high-quality embeddings with superior throughput, all while operating unobtrusively with minimal memory and energy consumption. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
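<p class="is-size-7">Coarse-grained embeddings plus query-based filtering suggest a retrieve-then-refine pattern; a generic sketch of that two-stage idea (names and shapes are hypothetical, not RECALL's implementation):</p>
<pre><code class="language-python">import numpy as np

def two_stage_search(query_coarse, query_fine, coarse_db, fine_db, k=100, top=5):
    # Stage 1: cheap dot-product scores on low-dimensional coarse embeddings
    # prune the candidate set to k items.
    candidates = np.argsort(-(coarse_db @ query_coarse))[:k]
    # Stage 2: rerank only the shortlisted items with full-dimensional vectors.
    fine_scores = fine_db[candidates] @ query_fine
    return candidates[np.argsort(-fine_scores)[:top]]
</code></pre>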
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15132">arXiv:2409.15132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15132">pdf</a>, <a href="https://arxiv.org/format/2409.15132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> FusionRF: High-Fidelity Satellite Neural Radiance Fields from Multispectral and Panchromatic Acquisitions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sprintson%2C+M">Michael Sprintson</a>, <a href="/search/cs?searchtype=author&amp;query=Chellappa%2C+R">Rama Chellappa</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.15132v1-abstract-full"> We introduce FusionRF, a novel neural rendering terrain reconstruction method from optically unprocessed satellite imagery. While previous methods depend on external pansharpening methods to fuse low-resolution multispectral imagery and high-resolution panchromatic imagery, FusionRF directly performs reconstruction based on optically unprocessed acquisitions with no prior knowledge. This is accomplished through the addition of a sharpening kernel which models the resolution loss in multispectral images. Additionally, novel modal embeddings allow the model to perform image fusion as a bottleneck to novel view synthesis. We evaluate our method on multispectral and panchromatic satellite images from the WorldView-3 satellite in various locations, and FusionRF outperforms previous state-of-the-art methods in depth reconstruction on unprocessed imagery, renders sharp training and novel views, and retains multi-spectral information. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12680">arXiv:2409.12680</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12680">pdf</a>, <a href="https://arxiv.org/format/2409.12680">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Minority Pseudo-Labels for Semi-Supervised Semantic Segmentation in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hong%2C+Y">Yuting Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+H">Hui Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+H">Huazheng Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+X">Xiaojie Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+B">Baochen Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chengbin Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.12680v2-abstract-full"> With the advancement of autonomous driving, semantic segmentation has achieved remarkable progress. The training of such networks heavily relies on image annotations, which are very expensive to obtain. Semi-supervised learning can utilize both labeled data and unlabeled data with the help of pseudo-labels. However, in many real-world scenarios where classes are imbalanced, majority classes often play a dominant role during training and the learning quality of minority classes can be undermined.
To overcome this limitation, we propose a synergistic training framework, including a professional training module to enhance minority class learning and a general training module to learn more comprehensive semantic information. Based on a pixel selection strategy, they can iteratively learn from each other to reduce error accumulation and coupling. In addition, dual contrastive learning with anchors is proposed to guarantee more distinct decision boundaries. In experiments, our framework demonstrates superior performance compared to state-of-the-art methods on benchmark datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05152">arXiv:2409.05152</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05152">pdf</a>, <a href="https://arxiv.org/format/2409.05152">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jintian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Mengshu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+L">Lei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huajun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+N">Ningyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2409.05152v2-abstract-full"> Despite the recent advancements in Large Language Models (LLMs), which have significantly enhanced the generative capabilities for various NLP tasks, LLMs still face limitations in directly handling retrieval tasks. However, many practical applications demand the seamless integration of both retrieval and generation. This paper introduces a novel and efficient One-pass Generation and retrieval framework (OneGen), designed to improve LLMs&#39; performance on tasks that require both generation and retrieval. The proposed framework bridges the traditionally separate training approaches for generation and retrieval by incorporating retrieval tokens generated autoregressively. This enables a single LLM to handle both tasks simultaneously in a unified forward pass. We conduct experiments on two distinct types of composite tasks, RAG and Entity Linking, to validate the pluggability, effectiveness, and efficiency of OneGen in training and inference. Furthermore, our results show that integrating generation and retrieval within the same context preserves the generative capabilities of LLMs while improving retrieval performance. To the best of our knowledge, OneGen is the first to enable LLMs to conduct vector retrieval during generation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
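<p class="is-size-7">Retrieval tokens generated autoregressively imply that retrieval embeddings can be read off the same forward pass that produces text. A schematic sketch of that idea (the token id and function below are hypothetical, not OneGen's code):</p>
<pre><code class="language-python">import numpy as np

RET_TOKEN_ID = 32000  # hypothetical id of a special retrieval token

def split_generation_and_queries(token_ids, hidden_states):
    """token_ids: length-T sequence; hidden_states: T x d array from the
    same forward pass. Retrieval-token positions double as query vectors."""
    text_ids, queries = [], []
    for t, h in zip(token_ids, hidden_states):
        if t == RET_TOKEN_ID:
            queries.append(h)   # hidden state reused as a retrieval query
        else:
            text_ids.append(t)  # ordinary generated token
    return text_ids, np.array(queries)
</code></pre>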
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings; code is available at https://github.com/zjunlp/OneGen</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00899">arXiv:2409.00899</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00899">pdf</a>, <a href="https://arxiv.org/format/2409.00899">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MarsCode Agent: AI-native Automated Bug Fixing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yizhou Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Pengfei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinchen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yexuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chao Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.00899v2-abstract-full"> Recent advances in large language models (LLMs) have shown significant potential to automate various software development tasks, including code completion, test generation, and bug fixing. However, the application of LLMs for automated bug fixing remains challenging due to the complexity and diversity of real-world software systems. In this paper, we introduce MarsCode Agent, a novel framework that leverages LLMs to automatically identify and repair bugs in software code. MarsCode Agent combines the power of LLMs with advanced code analysis techniques to accurately localize faults and generate patches. Our approach follows a systematic process of planning, bug reproduction, fault localization, candidate patch generation, and validation to ensure high-quality bug fixes. We evaluated MarsCode Agent on SWE-bench, a comprehensive benchmark of real-world software projects, and our results show that MarsCode Agent achieves a high success rate in bug fixing compared to most of the existing automated approaches. </span> </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Yizhou Liu and Pengfei Gao contributed equally and the order is determined by rolling the dice. Chao Peng is the corresponding author</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15242">arXiv:2408.15242</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.15242">pdf</a>, <a href="https://arxiv.org/format/2408.15242">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Drone-assisted Road Gaussian Splatting with Cross-view Uncertainty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Saining Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+B">Baijun Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaoxue Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuantao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zongzheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yongliang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.15242v1-abstract-full"> Robust and realistic rendering for large-scale road scenes is essential in autonomous driving simulation. Recently, 3D Gaussian Splatting (3D-GS) has made groundbreaking progress in neural rendering, but the general fidelity of large-scale road scene renderings is often limited by the input imagery, which usually has a narrow field of view and focuses mainly on the street-level local area.
Intuitively, the data from the drone&#39;s perspective can provide a complementary viewpoint for the data from the ground vehicle&#39;s perspective, enhancing the completeness of scene reconstruction and rendering. However, training naively with aerial and ground images, which exhibit large view disparity, poses a significant convergence challenge for 3D-GS, and does not demonstrate remarkable improvements in performance on road views. In order to enhance the novel view synthesis of road views and to effectively use the aerial information, we design an uncertainty-aware training method that allows aerial images to assist in the synthesis of areas where ground images have poor learning outcomes, instead of weighting all pixels equally in 3D-GS training as prior work did. We are the first to introduce cross-view uncertainty to 3D-GS by matching the car-view ensemble-based rendering uncertainty to aerial images, weighting the contribution of each pixel to the training process. Additionally, to systematically quantify evaluation metrics, we assemble a high-quality synthesized dataset comprising both aerial and ground images for road scenes. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BMVC2024 Project Page: https://sainingzhang.github.io/project/uc-gs/ Code: https://github.com/SainingZhang/uc-gs/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12792">arXiv:2408.12792</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12792">pdf</a>, <a href="https://arxiv.org/format/2408.12792">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Event Detection via Probability Density Function Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Clark Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Din%C3%A7er%2C+T">Tolga Dinçer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2408.12792v1-abstract-full"> In the domain of time series analysis, particularly in event detection tasks, current methodologies predominantly rely on segmentation-based approaches, which predict the class label for each individual timestep and use the changepoints of these labels to detect events. However, these approaches may not effectively detect the precise onset and offset of events within the data and suffer from class imbalance problems. This study introduces a generalized regression-based approach to reframe the time-interval-defined event detection problem. Inspired by heatmap regression techniques from computer vision, our approach aims to predict probability densities at event locations rather than class labels across the entire time series. The primary aim of this approach is to improve the accuracy of event detection methods, particularly for long-duration events where identifying the onset and offset is more critical than classifying individual event states. We demonstrate that regression-based approaches outperform segmentation-based methods across various state-of-the-art baseline networks and datasets, offering a more effective solution for specific event detection tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
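<p class="is-size-7">Heatmap-style regression replaces per-timestep class labels with a density target peaked at each event location; a minimal sketch of constructing such targets (the width sigma and names are illustrative assumptions):</p>
<pre><code class="language-python">import numpy as np

def density_targets(length, event_times, sigma=5.0):
    """Regression target for event detection: a Gaussian bump centered on
    each event time instead of a per-timestep class label."""
    t = np.arange(length, dtype=float)
    target = np.zeros(length)
    for e in event_times:
        target += np.exp(-0.5 * ((t - e) / sigma) ** 2)
    return np.clip(target, 0.0, 1.0)  # overlapping bumps capped at 1

y = density_targets(200, event_times=[40, 150])  # peaks at the two events
</code></pre>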
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0; I.5.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11240">arXiv:2408.11240</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11240">pdf</a>, <a href="https://arxiv.org/ps/2408.11240">ps</a>, <a href="https://arxiv.org/format/2408.11240">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Asymmetric Graph Error Control with Low Complexity in Causal Bandits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chen Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+U">Urbashi Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.11240v1-abstract-full"> In this paper, the causal bandit problem is investigated, in which the objective is to select an optimal sequence of interventions on nodes in a causal graph. It is assumed that the graph is governed by linear structural equations; it is further assumed that both the causal topology and the distribution of interventions are unknown. By exploiting the causal relationships between the nodes whose signals contribute to the reward, interventions are optimized. First, based on the difference between the two types of graph identification errors (false positives and negatives), a causal graph learning method is proposed, which strongly reduces sample complexity relative to the prior art by learning sub-graphs. Under the assumption of Gaussian exogenous inputs and minimum mean-squared error weight estimation, a new uncertainty bound tailored to the causal bandit problem is derived. This uncertainty bound drives an upper confidence bound based intervention selection to optimize the reward. To cope with non-stationary bandits, a sub-graph change detection mechanism is proposed, with high sample efficiency. Numerical results compare the new methodology to existing schemes and show a substantial performance improvement in both stationary and non-stationary settings. Compared to existing approaches, the proposed scheme takes 67% fewer samples to learn the causal structure and achieves an average reward gain of 85%. </span> </p>
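<p class="is-size-7">Upper-confidence-bound intervention selection follows the standard bandit template: pick the action maximizing estimated reward plus an uncertainty bonus. The paper derives a bound tailored to causal graphs; the sketch below uses the textbook UCB1 bonus instead:</p>
<pre><code class="language-python">import numpy as np

def ucb_select(means, counts, t):
    """Choose the next intervention: estimated mean reward plus an
    exploration bonus. sqrt(2 log t / n) is the textbook UCB1 bonus."""
    counts = np.maximum(counts, 1e-9)      # avoid division by zero
    bonus = np.sqrt(2.0 * np.log(max(t, 2)) / counts)
    return int(np.argmax(means + bonus))   # index of the next intervention
</code></pre>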
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08191">arXiv:2408.08191</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08191">pdf</a>, <a href="https://arxiv.org/format/2408.08191">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Full Labels: Energy-Double-Guided Single-Point Prompt for Infrared Small Target Label Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Shuai Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+H">Hanlin Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Kou%2C+R">Renke Kou</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+X">Xiang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zechuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chenxu Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Huixin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We pioneer a learning-based single-point prompt paradigm for infrared small target label generation (IRSTLG) to reduce annotation burdens. Unlike previous clustering-based methods, our intuition is that point-guided mask generation requires just one more prompt than target detection, i.e., IRSTLG can be treated as infrared small target detection (IRSTD) with a location hint. Therefore, we propose an elegant yet effective Energy-Double-Guided Single-point Prompt (EDGSP) framework, aiming to adeptly transform a coarse IRSTD network into a refined label generation method.
Specifically, EDGSP comprises three key modules: 1) target energy initialization (TEI), which establishes a foundational outline to streamline the mapping process for effective shape evolution; 2) double prompt embedding (DPE), which rapidly localizes regions of interest and reinforces high-resolution individual edges to avoid label adhesion; and 3) bounding-box-based matching (BBM), which eliminates false masks by considering comprehensive cluster boundary conditions to obtain a reliable output. In this way, pseudo-labels generated by three backbones equipped with our EDGSP achieve 100% object-level probability of detection (Pd) and a 0% false-alarm rate (Fa) on the SIRST, NUDT-SIRST, and IRSTD-1k datasets, with a pixel-level intersection-over-union (IoU) improvement of 13.28% over state-of-the-art (SOTA) label generation methods. Further applying our inferred masks to train detection models, EDGSP enables, for the first time, a single-point-generated pseudo mask to surpass manual labels. Even with coarse single-point annotations, it still achieves 99.5% of the performance of full labeling. Code is available at https://github.com/xdFai/EDGSP. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated the title to better reflect the content of the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06402">arXiv:2408.06402</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06402">pdf</a>, <a href="https://arxiv.org/format/2408.06402">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PhaGO: Protein function annotation for bacteriophages by integrating the genomic context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guan%2C+J">Jiaojiao Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Y">Yongxin Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+W">Wei Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+X">Xubo Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+J">Jiayu Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yanni Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Bacteriophages
are viruses that target bacteria, playing a crucial role in microbial ecology. Phage proteins are important in understanding phage biology, such as virus infection, replication, and evolution. Although a large number of new phages have been identified via metagenomic sequencing, many of them have limited protein function annotation. Accurate function annotation of phage proteins presents several challenges, including their inherent diversity and the scarcity of annotated ones. Existing tools have yet to fully leverage the unique properties of phages in annotating protein functions. In this work, we propose a new protein function annotation tool for phages that leverages the modular genomic structure of phage genomes. By employing embeddings from the latest protein foundation models and a Transformer to capture contextual information between proteins in phage genomes, PhaGO surpasses state-of-the-art methods in annotating diverged proteins and proteins with uncommon functions by 6.78% and 13.05%, respectively. PhaGO can annotate proteins lacking homology search results, which is critical for characterizing the rapidly accumulating phage genomes. We demonstrate the utility of PhaGO by identifying 688 potential holins in phages, which exhibit high structural conservation with known holins. The results show the potential of PhaGO to extend our understanding of newly discovered phages. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03519">arXiv:2408.03519</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03519">pdf</a>, <a href="https://arxiv.org/format/2408.03519">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RepoMasterEval: Evaluating Code Completion via Real-World Repositories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qinyun Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Pengfei Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruida Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+H">Haoyu Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jinhe Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhiwen Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+Z">Zhanming Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Cuiyun Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xia Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+P">Ping Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: With the growing reliance on automated code completion tools in software development, the need for robust evaluation benchmarks has become critical. However, existing benchmarks focus on code generation tasks at the function and class level and provide rich textual descriptions to prompt the model. By contrast, such descriptive prompts are commonly unavailable in real development, and code completion can occur in a wider range of situations, such as in the middle of a function or a code block. These limitations make the evaluation poorly aligned with the practical scenarios of code completion tools. In this paper, we propose RepoMasterEval, a novel benchmark for evaluating code completion models constructed from real-world Python and TypeScript repositories.
Each benchmark datum is generated by masking a code snippet (ground truth) from one source code file with existing test suites. To improve the test accuracy of model-generated code, we employ mutation testing to measure the effectiveness of the test cases, and we manually craft new test cases for those test suites with low mutation scores. Our empirical evaluation on 6 state-of-the-art models shows that test augmentation is critical in improving the accuracy of the benchmark and that RepoMasterEval is able to report differences in model performance in real-world scenarios. The deployment of RepoMasterEval at a collaborating company for one month also revealed that the benchmark is useful for giving accurate feedback during model training, and that its score correlates highly with the model&#39;s performance in practice. Based on our findings, we call for the software engineering community to build more LLM benchmarks tailored for code generation tools, taking the practical and complex development environment into consideration. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
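<p>The mutation-testing step described above can be illustrated in a few lines (our sketch, not the RepoMasterEval implementation): mutate the code under test and count how many mutants the test suite kills; suites with low mutation scores then receive extra hand-written tests.</p>
<pre><code># Toy mutation testing: a strong test suite should fail ("kill") most
# mutated variants of the code under test. Surviving mutants indicate
# weak tests.
def run_suite(add):
    # The "test suite" for an addition function.
    return add(2, 3) == 5 and add(0, 0) == 0

original = lambda a, b: a + b
assert run_suite(original)  # the suite passes on the real code

# Hand-made mutants standing in for automated operator mutations.
mutants = [lambda a, b: a - b, lambda a, b: a * b, lambda a, b: b]

killed = sum(1 for m in mutants if not run_suite(m))
print(f"mutation score: {killed}/{len(mutants)}")  # 3/3: every mutant caught
</code></pre>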
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01929">arXiv:2408.01929</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.01929">pdf</a>, <a href="https://arxiv.org/format/2408.01929">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing H&amp;E-to-IHC Stain Translation in Breast Cancer: A Multi-Magnification and Attention-Based Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chengsheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guihui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Haiyong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chen Peng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Breast cancer presents a significant healthcare challenge globally, demanding precise diagnostics and effective treatment strategies, where histopathological examination of Hematoxylin and Eosin (H&amp;E) stained tissue sections plays a central role. Despite its importance, evaluating specific biomarkers like Human Epidermal Growth Factor Receptor 2 (HER2) for personalized treatment remains constrained by the resource-intensive nature of Immunohistochemistry (IHC). Recent strides in deep learning, particularly in image-to-image translation, offer promise in synthesizing IHC-HER2 slides from H&amp;E-stained slides. However, existing methodologies encounter challenges, including managing multiple magnifications in pathology images and insufficient focus on crucial information during translation. To address these issues, we propose a novel model integrating attention mechanisms and multi-magnification information processing. Our model employs a multi-magnification processing strategy to extract and utilize information from various magnifications within pathology images, facilitating robust image translation. Additionally, an attention module within the generative network prioritizes critical information for image distribution translation while minimizing less pertinent details. Rigorous testing on a publicly available breast cancer dataset demonstrates superior performance compared to existing methods, establishing our model as a state-of-the-art solution in advancing pathology image translation from H&amp;E to IHC staining. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE CIS-RAM 2024 Invited Session Oral</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01928">arXiv:2408.01928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.01928">pdf</a>, <a href="https://arxiv.org/format/2408.01928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Semi-supervised Multi-channel Graph Convolutional Network for Query Classification in E-commerce </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chunyuan Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+M">Ming Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xue Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Changping Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhangang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Query intent classification is an essential module for customers to find desired products on e-commerce applications quickly. Most existing query intent classification methods rely on users&#39; click behavior as a supervised signal to construct training samples. However, these methods based entirely on posterior labels may lead to serious category imbalance problems because of the Matthew effect in click samples. Compared with popular categories, it is difficult for products under long-tail categories to obtain traffic and user clicks, which makes the models unable to detect users&#39; intent for products under long-tail categories. This in turn aggravates the problem that long-tail categories cannot obtain traffic, forming a vicious circle. In addition, due to the randomness of user clicks, posterior labels are unstable for queries with similar semantics, which makes the model very sensitive to its input and leads to unstable and incomplete recall of categories.
In this paper, we propose a novel Semi-supervised Multi-channel Graph Convolutional Network (SMGCN) to address the above problems from the perspective of label association and semi-supervised learning. SMGCN extends category information and enhances the posterior label by utilizing the similarity score between the query and categories. Furthermore, it leverages the co-occurrence and semantic similarity graphs of categories to strengthen the relations among labels and weaken the influence of posterior label instability. We conduct extensive offline and online A/B experiments, and the experimental results show that SMGCN significantly outperforms strong baselines, demonstrating its effectiveness and practicality. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW2024</span> </p> </li>
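<p>The graph-convolutional machinery such a model builds on can be sketched briefly (generic GCN propagation under our own toy setup, not the SMGCN code): category nodes exchange information over a co-occurrence graph, which smooths unstable posterior labels across related categories.</p>
<pre><code># One standard GCN propagation step over a category co-occurrence graph:
# H_next = ReLU(D^-1/2 (A + I) D^-1/2 H W). Values here are toy data.
import numpy as np

A = np.array([[0, 1, 1],
              [1, 0, 0],
              [1, 0, 0]], dtype=float)          # category co-occurrence graph
H = np.eye(3)                                   # initial category features
W = np.random.default_rng(0).normal(size=(3, 3))  # learnable weight matrix

A_hat = A + np.eye(3)                           # add self-loops
d = A_hat.sum(axis=1)
D_inv_sqrt = np.diag(d ** -0.5)                 # symmetric normalization
H_next = np.maximum(0.0, D_inv_sqrt @ A_hat @ D_inv_sqrt @ H @ W)  # ReLU
print(H_next.shape)  # (3, 3): smoothed category representations
</code></pre>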
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00766">arXiv:2408.00766</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00766">pdf</a>, <a href="https://arxiv.org/format/2408.00766">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Diffusion Models for Joint Trajectory Prediction and Controllable Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yixiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Lingfeng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Rossi%2C+S">Simone Rossi</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yichen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Hannagan%2C+T">Thomas Hannagan</a>, <a href="/search/cs?searchtype=author&amp;query=Sabatini%2C+S">Stefano Sabatini</a>, <a href="/search/cs?searchtype=author&amp;query=Poerio%2C+N">Nicola Poerio</a>, <a href="/search/cs?searchtype=author&amp;query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+W">Wei Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Diffusion models are promising for joint trajectory prediction and controllable generation in autonomous driving, but they face challenges of inefficient inference steps and high computational demands. To tackle these challenges, we introduce Optimal Gaussian Diffusion (OGD) and Estimated Clean Manifold (ECM) Guidance. OGD optimizes the prior distribution for a small diffusion time $T$ and starts the reverse diffusion process from it. ECM directly injects guidance gradients into the estimated clean manifold, eliminating extensive gradient backpropagation throughout the network. Our methodology streamlines the generative process, enabling practical applications with reduced computational overhead. Experimental validation on the large-scale Argoverse 2 dataset demonstrates our approach&#39;s superior performance, offering a viable solution for computationally efficient, high-quality joint trajectory prediction and controllable generation for autonomous driving. Our project webpage is at https://yixiaowang7.github.io/OptTrajDiff_Page/. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 20 figures, Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17786">arXiv:2407.17786</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.17786">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Topology-Preserving Downsampling of Binary Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chia-Chia Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chi-Han Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We present a novel discrete optimization-based approach to generate downsampled versions of binary images that are guaranteed to have the same topology as the original, measured by the zeroth and first Betti numbers of the black regions, while having good similarity to the original image as measured by IoU and Dice scores. To the best of our knowledge, no existing binary image downsampling method offers such topology-preserving guarantees. We also implemented a baseline morphological operation (dilation)-based approach that always generates topologically correct results; however, we found its similarity scores to be much worse. We demonstrate several applications of our approach. First, generating smaller versions of medical image segmentation masks for easier human inspection. Second, improving the efficiency of binary image operations, including persistent homology computation and shortest path computation, by substituting the original images with smaller ones. In particular, the latter is a novel application that is made feasible only by the full topology-preservation guarantee of our method.
</p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the 18th European Conference on Computer Vision (ECCV) 2024</span> </p> </li>
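<p>The topology guarantee above is stated in terms of Betti numbers, which for 2D binary images can be computed by component labeling. A minimal sketch (our illustration, assuming scipy&#39;s ndimage labeling): beta_0 counts connected black components and beta_1 counts holes.</p>
<pre><code># Betti numbers of a 2D binary image via component labeling: beta_0 is
# the number of 8-connected foreground components; beta_1 is the number
# of background components other than the one touching the border.
import numpy as np
from scipy.ndimage import label

def betti_numbers(img):
    # img: 2D array, 1 = black (foreground), 0 = white (background).
    _, b0 = label(img, structure=np.ones((3, 3)))   # 8-connectivity
    padded = np.pad(1 - img, 1, constant_values=1)  # white border ring
    _, n_bg = label(padded)                          # 4-connectivity
    b1 = n_bg - 1                                    # drop outer background
    return b0, b1

ring = np.zeros((7, 7), dtype=int)
ring[1:6, 1:6] = 1
ring[2:5, 2:5] = 0          # a square annulus: one component, one hole
print(betti_numbers(ring))  # (1, 1)
</code></pre>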
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15359">arXiv:2407.15359</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.15359">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UF-HOBI at &#34;Discharge Me!&#34;: A Hybrid Solution for Discharge Summary Generation Through Prompt-based Tuning of GatorTronGPT Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+M">Mengxian Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Paredes%2C+D">Daniel Paredes</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Ziyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Aokun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Bian%2C+J">Jiang Bian</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yonghui Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Automatic generation of discharge summaries presents significant challenges due to the length of clinical documentation, the dispersed nature of patient information, and the diverse terminology used in healthcare. This paper presents a hybrid solution for generating discharge summary sections as part of our participation in the &#34;Discharge Me!&#34; Challenge at the BioNLP 2024 Shared Task. We developed a two-stage generation method using both extractive and abstractive techniques: we first apply named entity recognition (NER) to extract key clinical concepts, which are then used as input for a prompt-tuning-based GatorTronGPT model to generate coherent text for two important sections, &#34;Brief Hospital Course&#34; and &#34;Discharge Instructions&#34;. Our system was ranked 5th in the challenge, achieving an overall score of 0.284. The results demonstrate the effectiveness of our hybrid solution in improving the quality of automated discharge section generation. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BIONLP 2024 and Shared Tasks @ ACL 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> BIONLP 2024 and Shared Tasks @ ACL 2024 </p> </li>
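<p>The two-stage pipeline has the following shape; extract_entities and generate_section below are hypothetical stand-ins for the NER model and the prompt-tuned GatorTronGPT model, not APIs from the paper.</p>
<pre><code># Extract-then-generate pipeline shape: stage 1 pulls clinical concepts
# out of a note, stage 2 prompts a generator per target section.
def extract_entities(note):
    # Hypothetical stand-in for an NER model: grab capitalized terms.
    return [tok for tok in note.split() if tok.istitle()]

def generate_section(section, concepts):
    # Hypothetical stand-in for a prompt-tuned LLM call; a real system
    # would return model output, not the prompt itself.
    return f"Write the {section} section using: {', '.join(concepts)}"

note = "Patient admitted with Pneumonia, treated with Azithromycin."
concepts = extract_entities(note)
for section in ("Brief Hospital Course", "Discharge Instructions"):
    print(generate_section(section, concepts))
</code></pre>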
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14367">arXiv:2407.14367</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14367">pdf</a>, <a href="https://arxiv.org/format/2407.14367">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Thinking Racial Bias in Fair Forgery Detection: Models, Datasets and Evaluations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Decheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zongqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chunlei Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nannan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruimin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Due to the successful development of deep image generation technology, forgery detection plays an increasingly important role in social and economic security, yet racial bias has not been explored thoroughly in the deep forgery detection field. In this paper, we first contribute a dedicated dataset, the Fair Forgery Detection (FairFD) dataset, on which we demonstrate the racial bias of public state-of-the-art (SOTA) methods. Different from existing forgery detection datasets, the self-constructed FairFD dataset contains a balanced racial ratio and diverse forgery generation images with the largest-scale subjects. Additionally, we identify the problems with naive fairness metrics when benchmarking forgery detection models. To comprehensively evaluate fairness, we design novel metrics, including the Approach Averaged Metric and the Utility Regularized Metric, which avoid deceptive results. We also present an effective and robust post-processing technique, Bias Pruning with Fair Activations (BPFA), which improves fairness without requiring retraining or weight updates. Extensive experiments conducted with 12 representative forgery detection models demonstrate the value of the proposed dataset and the reasonableness of the designed fairness metrics. By applying BPFA to the existing fairest detector, we achieve a new SOTA. Furthermore, we conduct more in-depth analyses to offer more insights to inspire researchers in the community. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09874">arXiv:2407.09874</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09874">pdf</a>, <a href="https://arxiv.org/format/2407.09874">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SeFi-CD: A Semantic First Change Detection Paradigm That Can Detect Any Change You Want </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Ling Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhenyang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+D">Dongsheng Kuang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chengli Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+J">Jun Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haifeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Existing change detection (CD) methods can be summarized as the visual-first change detection (ViFi-CD) paradigm, which first extracts change features from visual differences and then assigns them specific semantic information. However, CD is essentially dependent on change regions of interest (CRoIs), meaning that the CD results are directly determined by the semantic changes of interest, making the primary factor the semantics of interest rather than the visual differences. The ViFi-CD paradigm can only assign specific semantics of interest to specific change features extracted from visual differences, leading to the inevitable omission of potential CRoIs and the inability to adapt to different CRoI CD tasks. In other words, changes in other CRoIs cannot be detected by the ViFi-CD method without retraining the model or significantly modifying the method. This paper introduces a new CD paradigm, the semantic-first CD (SeFi-CD) paradigm. The core idea of SeFi-CD is to first perceive the dynamic semantics of interest and then visually search for change features related to those semantics. Based on the SeFi-CD paradigm, we designed Anything You Want Change Detection (AUWCD).
Experiments on public datasets demonstrate that the AUWCD outperforms the current state-of-the-art CD methods, achieving an average F1 score 5.01% higher than that of these advanced supervised baselines on the SECOND dataset, with a maximum increase of 13.17%. The proposed SeFi-CD offers a novel CD perspective and approach. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09777">arXiv:2407.09777</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09777">pdf</a>, <a href="https://arxiv.org/format/2407.09777">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Graph Transformers: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shehzad%2C+A">Ahsan Shehzad</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+F">Feng Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Abid%2C+S">Shagufta Abid</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Ciyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shuo Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+D">Dongyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Verspoor%2C+K">Karin Verspoor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Graph transformers are a recent advancement in machine learning, offering a new class of neural network models for graph-structured data. The synergy between transformers and graph learning demonstrates strong performance and versatility across various graph-related tasks. This survey provides an in-depth review of recent progress and challenges in graph transformer research. We begin with foundational concepts of graphs and transformers.
We then explore the design perspectives of graph transformers, focusing on how they integrate graph inductive biases and graph attention mechanisms into the transformer architecture. Furthermore, we propose a taxonomy that classifies graph transformers by depth, scalability, and pre-training strategy, and we summarize key principles for the effective development of graph transformer models. Beyond technical analysis, we discuss the applications of graph transformer models for node-level, edge-level, and graph-level tasks, and we explore their potential in other application scenarios as well. Finally, we identify remaining challenges in the field, such as scalability and efficiency, generalization and robustness, interpretability and explainability, dynamic and complex graphs, and data quality and diversity, charting future directions for graph transformer research. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 4 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 68T05; 68U01 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06366">arXiv:2407.06366</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06366">pdf</a>, <a href="https://arxiv.org/format/2407.06366">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Traveling Salesperson Problem with Neighborhoods for Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+M">Minghan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Isler%2C+V">Volkan Isler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We introduce a new route-finding problem which considers perception and travel costs simultaneously. Specifically, we consider the problem of finding the shortest tour such that all objects of interest can be detected successfully.
To represent a viable detection region for each object, we propose to use an entropy-based viewing score that generates a diameter-bounded region as a viewing neighborhood. We formulate the detection-based trajectory planning problem as a stochastic traveling salesperson problem with neighborhoods and propose a center-visit method that obtains an approximation ratio of $O(D_{max}/D_{min})$ for disjoint regions. For non-disjoint regions, our method provides a novel finite detour in 3D, which utilizes the region&#39;s minimum curvature property. Finally, we show that our method can generate efficient trajectories compared to a baseline method in a photo-realistic simulation environment. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2023 IEEE International Conference on Robotics and Automation</span> </p> </li>
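<p>The center-visit idea can be sketched as follows (a simplification under our own assumptions, not the paper&#39;s algorithm): plan a tour over the centers of the diameter-bounded viewing regions with a nearest-neighbor heuristic, so that visiting each center guarantees its viewing neighborhood has been entered.</p>
<pre><code># Nearest-neighbor tour over the centers of viewing regions; toy data.
import numpy as np

centers = np.array([[0.0, 0.0], [4.0, 1.0], [1.0, 5.0], [6.0, 4.0]])

def nearest_neighbor_tour(pts):
    # Greedy heuristic: always travel to the closest unvisited center.
    unvisited = list(range(1, len(pts)))
    tour = [0]
    while unvisited:
        last = pts[tour[-1]]
        nxt = min(unvisited, key=lambda i: np.linalg.norm(pts[i] - last))
        tour.append(nxt)
        unvisited.remove(nxt)
    return tour

tour = nearest_neighbor_tour(centers)
length = sum(np.linalg.norm(centers[tour[i + 1]] - centers[tour[i]])
             for i in range(len(tour) - 1))
print(tour, round(float(length), 2))
</code></pre>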
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19804">arXiv:2406.19804</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.19804">pdf</a>, <a href="https://arxiv.org/ps/2406.19804">ps</a>, <a href="https://arxiv.org/format/2406.19804">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Rateless Stochastic Coding for Delay-constrained Semantic Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rulong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yong Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We consider the problem of joint source-channel coding (JSCC) with distortion and perception constraints from a rateless perspective, the purpose of which is to settle the balance between reliability (distortion/perception) and effectiveness (rate) of transmission over uncertain channels. We find a new finite-blocklength bound for the achievable joint source-channel code rate under these two constraints. To achieve a superior rateless characteristic of JSCC coding, we perform multi-level optimization on various finite-blocklength codes. Building on these two results, we propose a new JSCC coding scheme called rateless stochastic coding (RSC). We experimentally demonstrate that the proposed RSC can achieve variable transmission rates while maintaining an excellent trade-off between distortion and perception. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11145">arXiv:2406.11145</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11145">pdf</a>, <a href="https://arxiv.org/format/2406.11145">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Federated Face Forgery Detection Learning with Personalized Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Decheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+Z">Zhan Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chunlei Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nannan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruimin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Deep generator technology can produce high-quality fake videos that are indistinguishable from real ones, posing a serious social threat. Traditional forgery detection methods train directly on centralized data, without considering information sharing in non-public video data scenarios or data privacy.
Naturally, the federated learning strategy can be applied for privacy protection, as it aggregates the model parameters of clients but not their original data. However, simple federated learning can&#39;t achieve satisfactory performance because of poor generalization capabilities on real hybrid-domain forgery datasets. To solve this problem, this paper proposes novel federated face forgery detection learning with personalized representation. The designed Personalized Forgery Representation Learning aims to learn a personalized representation for each client to improve the detection performance of individual client models. In addition, a personalized federated learning training strategy is utilized to update the parameters of the distributed detection model: collaborative training is conducted on multiple distributed client devices, and shared representations of these client models are uploaded to the server side for aggregation. Experiments on several public face forgery detection datasets demonstrate the superior performance of the proposed algorithm compared with state-of-the-art methods. The code is available at https://github.com/GANG370/PFR-Forgery. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code is publicly available</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.10933">arXiv:2406.10933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.10933">pdf</a>, <a href="https://arxiv.org/format/2406.10933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Adversarial Robustness via Decoupled Visual Representation Masking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Decheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+C">Chunlei Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nannan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+R">Ruimin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xinbo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.10933v1-abstract-short" style="display: inline;"> Deep neural networks are proven to be vulnerable to fine-designed adversarial examples, and adversarial defense algorithms draw more and more attention nowadays. Pre-processing based defense is a major strategy, as well as learning robust feature representation has been proven an effective way to boost generalization. However, existing defense works lack considering different depth-level visual fe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.10933v1-abstract-full').style.display = 'inline'; document.getElementById('2406.10933v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.10933v1-abstract-full" style="display: none;"> Deep neural networks are proven to be vulnerable to fine-designed adversarial examples, and adversarial defense algorithms draw more and more attention nowadays. Pre-processing based defense is a major strategy, as well as learning robust feature representation has been proven an effective way to boost generalization. However, existing defense works lack considering different depth-level visual features in the training process. In this paper, we first highlight two novel properties of robust features from the feature distribution perspective: 1) \textbf{Diversity}. The robust feature of intra-class samples can maintain appropriate diversity; 2) \textbf{Discriminability}. The robust feature of inter-class samples should ensure adequate separation. We find that state-of-the-art defense methods aim to address both of these mentioned issues well. It motivates us to increase intra-class variance and decrease inter-class discrepancy simultaneously in adversarial training. Specifically, we propose a simple but effective defense based on decoupled visual representation masking. 
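
The aggregation step described in this abstract (clients upload only shared representations; personalized parameters stay local) fits in a few lines. Below is a minimal Python sketch of that idea, assuming models are plain dictionaries of NumPy arrays and that shared_keys names the shared-representation parameters; this is my illustration, not the released PFR-Forgery code.

    import numpy as np

    def server_aggregate(client_models, shared_keys):
        # Average only the shared-representation parameters across clients;
        # raw video data never leaves the client devices.
        return {k: np.mean([m[k] for m in client_models], axis=0)
                for k in shared_keys}

    def client_sync(model, aggregated, shared_keys):
        # Install the server average; personalized layers stay untouched.
        for k in shared_keys:
            model[k] = aggregated[k].copy()
        return model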

arXiv:2406.10933 [pdf, other] cs.CV
Improving Adversarial Robustness via Decoupled Visual Representation Masking
Authors: Decheng Liu, Tao Chen, Chunlei Peng, Nannan Wang, Ruimin Hu, Xinbo Gao
Abstract: Deep neural networks are known to be vulnerable to carefully designed adversarial examples, and adversarial defense algorithms are drawing increasing attention. Pre-processing-based defense is one major strategy, and learning robust feature representations has proven an effective way to boost generalization. However, existing defense works fail to consider visual features at different depth levels during training. In this paper, we first highlight two novel properties of robust features from the feature-distribution perspective: 1) Diversity: the robust features of intra-class samples should maintain appropriate diversity; 2) Discriminability: the robust features of inter-class samples should ensure adequate separation. We find that state-of-the-art defense methods do not address both of these issues well, which motivates us to increase intra-class diversity and inter-class separation simultaneously in adversarial training. Specifically, we propose a simple but effective defense based on decoupled visual representation masking. The designed Decoupled Visual Feature Masking (DFM) block can adaptively disentangle visually discriminative features from non-visual features with diverse mask strategies, while suitably discarding information disrupts adversarial noise and improves robustness. Our work provides a generic, easy-to-plug-in block that any prior adversarial training algorithm can adopt for better protection. Extensive experimental results show that the proposed method achieves superior performance compared with state-of-the-art defense approaches. The code is publicly available at https://github.com/chenboluo/Adversarial-defense.
Submitted 16 June, 2024; originally announced June 2024.
Comments: The code is publicly available.
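
As a rough picture of what such a masking block computes, here is a minimal PyTorch sketch under my reading of the abstract: a learned gate splits features into a kept branch (treated as discriminative) and a discarded branch. The class name and gate design are assumptions, not the released DFM code.

    import torch
    import torch.nn as nn

    class FeatureMaskBlock(nn.Module):
        # A learned per-channel gate: features weighted by the mask form the
        # discriminative branch; the complement is discarded, which is meant
        # to disrupt adversarial noise riding on those channels.
        def __init__(self, channels):
            super().__init__()
            self.gate = nn.Sequential(nn.Linear(channels, channels), nn.Sigmoid())

        def forward(self, feats):            # feats: (batch, channels)
            mask = self.gate(feats)          # soft mask in (0, 1)
            return feats * mask, feats * (1.0 - mask)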

arXiv:2406.10887 [pdf, other] cs.CV
Imperceptible Face Forgery Attack via Adversarial Semantic Mask
Authors: Decheng Liu, Qixuan Su, Chunlei Peng, Nannan Wang, Xinbo Gao
Abstract: With the great development of generative model techniques, face forgery detection draws increasing attention in the field. Researchers have found that existing face forgery detection models are still vulnerable to adversarial examples with pixel perturbations generated over the global image. These adversarial samples nevertheless remain unsatisfactory because of their high detectability. To address these problems, we propose an Adversarial Semantic Mask Attack framework (ASMA), which can generate adversarial examples with good transferability and invisibility. Specifically, we propose a novel adversarial semantic mask generative model that constrains the generated perturbations to local semantic regions for good stealthiness. The designed adaptive semantic mask selection strategy effectively leverages the class activation values of different semantic regions, further ensuring better attack transferability and stealthiness. Extensive experiments on a public face forgery dataset show that the proposed method achieves superior performance compared with several representative adversarial attack methods. The code is publicly available at https://github.com/clawerO-O/ASMA.
Submitted 16 June, 2024; originally announced June 2024.
Comments: The code is publicly available.
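
The core mechanism of confining a perturbation to selected semantic regions is easy to state. The sketch below shows a generic signed-gradient step restricted by a binary semantic mask; it illustrates the idea only and is not the ASMA generative model.

    import numpy as np

    def masked_attack_step(x, grad, mask, eps=0.03):
        # One FGSM-style step whose perturbation is confined to the binary
        # semantic mask (1 = attackable region, 0 = left untouched).
        delta = eps * np.sign(grad) * mask
        return np.clip(x + delta, 0.0, 1.0)   # keep a valid image range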

arXiv:2406.05988 [pdf, other] cs.GT
Sponsored Search Auction Design Beyond Single Utility Maximization
Authors: Changfeng Xu, Chao Peng, Chenyang Xu, Zhengfeng Yang
Abstract: Auction design for the modern advertising market has gained significant prominence in the field of game theory. With the recent rise of auto-bidding tools, an increasing number of advertisers in the market are using these tools in auctions, and the diverse array of auto-bidding tools has made auction design more challenging. Various types of bidders, such as quasi-linear utility maximizers and constrained value maximizers, coexist in this dynamic gaming environment. We study sponsored search auction design in such a mixed-bidder world and aim to design truthful mechanisms that maximize the total social welfare. To capture the classical utility and the value-max utility simultaneously, we introduce an allowance utility model, in which each bidder is endowed with an additional allowance parameter signifying the threshold up to which the bidder can maintain a value-max strategy. The paper distinguishes two settings based on the accessibility of the allowance information. In the case where each bidder's allowance is public, we demonstrate the existence of a truthful mechanism achieving an approximation ratio of $(1+\epsilon)$ for any $\epsilon > 0$. In the more challenging private-allowance setting, we establish that a truthful mechanism can achieve a constant approximation. Further, we consider uniform-price auction design in large markets and give a truthful mechanism that sets a uniform price at random and admits a bounded approximation in expectation.
Submitted 9 June, 2024; originally announced June 2024.
Comments: To appear in COCOON 2024.
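
One plausible reading of the allowance utility model, stated here as my assumption rather than the paper's formal definition, is that a bidder ignores payments up to the allowance (behaving as a value maximizer) and reverts to quasi-linear utility beyond it:

    def allowance_utility(value, payment, allowance):
        # Hypothetical reading: value-max regime while the payment stays
        # within the allowance; classical quasi-linear utility beyond it.
        if payment <= allowance:
            return value
        return value - payment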

arXiv:2406.00148 [pdf, other] cs.DS
Submodular Maximization in Exactly $n$ Queries
Authors: Eric Balkanski, Steven DiSilvio, Alan Kuhnle, ChunLi Peng
Abstract: In this work, we study the classical problem of maximizing a submodular function subject to a matroid constraint. We develop deterministic algorithms that are very parsimonious with respect to querying the submodular function, both when the submodular function is monotone and in the general submodular case. In particular, we present a 1/4-approximation algorithm for the monotone case that uses exactly one query per element, giving the same total number of queries $n$ as is required just to compute the maximum singleton. For the general case, we present a constant-factor approximation algorithm that requires 2 queries per element, the first algorithm for this problem with query complexity linear in the size of the ground set.
Submitted 27 August, 2024; v1 submitted 31 May, 2024; originally announced June 2024.
Comments: The same algorithm and 1/4-approximation result for the monotone case were previously obtained by Dutting et al. [14]; at the time of writing, we were not aware of that paper. We generalize the algorithm to p-matchoid constraints. Because the base algorithm for the monotone case is identical to that of Dutting et al. [14], we view the technical contribution of this manuscript as limited.
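
The abstract does not spell the algorithm out, so the sketch below is a classic single-pass swap rule in the same one-query-per-element spirit: a streaming-style heuristic for a size constraint, not the authors' method. Here f is the value oracle, and each arriving element costs exactly one query f(S + [e]).

    def one_query_swap(elements, f, k):
        S, gains, val = [], [], 0.0
        for e in elements:
            g = f(S + [e]) - val              # the single query for e
            if len(S) < k and g > 0:
                S.append(e); gains.append(g); val += g
            elif gains and g > 2 * min(gains):
                i = gains.index(min(gains))   # swap out the weakest element
                val += g - gains[i]           # optimistic running estimate
                S[i], gains[i] = e, g
        return S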

arXiv:2405.18784 [pdf, other] cs.CV
LP-3DGS: Learning to Prune 3D Gaussian Splatting
Authors: Zhaoliang Zhang, Tianchen Song, Yongjae Lee, Li Yang, Cheng Peng, Rama Chellappa, Deliang Fan
Abstract: Recently, 3D Gaussian Splatting (3DGS) has become one of the mainstream methodologies for novel view synthesis (NVS) thanks to its high quality and fast rendering speed. However, as a point-based scene representation, 3DGS can generate a large number of Gaussians to fit the scene, leading to high memory usage. Proposed improvements require either an empirical, preset pruning ratio or an importance-score threshold to prune the point cloud. Such a hyperparameter requires multiple rounds of training to optimize, in order to reach the maximum pruning ratio for each scene while maintaining rendering quality. In this work, we propose learning-to-prune 3DGS (LP-3DGS), where a trainable binary mask applied to the importance score finds the optimal pruning ratio automatically. Instead of using the traditional straight-through estimator (STE) to approximate the binary mask gradient, we redesign the masking function to leverage the Gumbel-Sigmoid method, making it differentiable and compatible with the existing 3DGS training process. Extensive experiments show that LP-3DGS consistently achieves a good balance between efficiency and high quality.
Submitted 29 May, 2024; originally announced May 2024.
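
The Gumbel-Sigmoid relaxation named above is standard and compact. Here is a generic PyTorch sketch (my illustration, not the LP-3DGS code): logistic noise is added to the mask logits and squashed by a temperature-scaled sigmoid, so the near-binary mask stays differentiable during training.

    import torch

    def gumbel_sigmoid(logits, tau=1.0):
        # Sample Logistic(0, 1) noise via the inverse CDF of a uniform.
        u = torch.rand_like(logits).clamp(1e-6, 1 - 1e-6)
        noise = torch.log(u) - torch.log1p(-u)
        return torch.sigmoid((logits + noise) / tau)  # soft mask in (0, 1)

    # A hard binary mask for inference can be taken as gumbel_sigmoid(logits) > 0.5.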

arXiv:2405.05202 [pdf, other] cs.DS, cs.DM, cs.LG
Discretely Beyond $1/e$: Guided Combinatorial Algorithms for Submodular Maximization
Authors: Yixin Chen, Ankur Nath, Chunli Peng, Alan Kuhnle
Abstract: For constrained, not necessarily monotone submodular maximization, all known approximation algorithms with ratio greater than $1/e$ require continuous ideas, such as queries to the multilinear extension of a submodular function and its gradient, which are typically expensive to simulate with the original set function. For combinatorial algorithms, the best known approximation ratios for both size and matroid constraints are obtained by a simple randomized greedy algorithm of Buchbinder et al. [9]: $1/e \approx 0.367$ for a size constraint and $0.281$ for a matroid constraint, in $\mathcal O(kn)$ queries, where $k$ is the rank of the matroid. In this work, we develop the first combinatorial algorithms to break the $1/e$ barrier: we obtain an approximation ratio of $0.385$ in $\mathcal O(kn)$ queries to the submodular set function for a size constraint, and $0.305$ for a general matroid constraint. These ratios are achieved by guiding the randomized greedy algorithm with a fast local search algorithm. Further, we develop deterministic versions of these algorithms that maintain the same ratios and asymptotic time complexity. Finally, we develop a deterministic, nearly linear time algorithm with ratio $0.377$.
Submitted 22 May, 2024; v1 submitted 8 May, 2024; originally announced May 2024.
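
The randomized greedy baseline of Buchbinder et al. that this paper builds on is simple to state for a size constraint. The sketch below shows that baseline, not the guided variant proposed here; f is the submodular value oracle, and the loop issues on the order of the $\mathcal O(kn)$ marginal-gain queries mentioned above.

    import random

    def randomized_greedy(elements, f, k):
        # In each of k rounds, rank remaining elements by marginal gain and
        # pick uniformly at random among the top k of them.
        S = []
        for _ in range(k):
            rest = [e for e in elements if e not in S]
            if not rest:
                break
            base = f(S)
            rest.sort(key=lambda e: f(S + [e]) - base, reverse=True)
            S.append(random.choice(rest[:k]))
        return S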

arXiv:2405.04111 [pdf, other] cs.LG, eess.SP
Adaptive Least Mean pth Power Graph Neural Networks
Authors: Yi Yan, Changran Peng, Ercan E. Kuruoglu
Abstract: In the presence of impulsive noise and missing observations, accurate online prediction of time-varying graph signals poses a crucial challenge in numerous application domains. We propose the Adaptive Least Mean $p^{th}$ Power Graph Neural Networks (LMP-GNN), a universal framework combining adaptive filters and graph neural networks for online graph signal estimation. LMP-GNN retains the advantages of adaptive filtering in handling noise and missing observations, as well as its online update capability. The graph neural network incorporated within LMP-GNN trains and updates the filter parameters online, instead of relying on the predefined filter parameters of previous methods, and thus outputs more accurate predictions. The adaptive update scheme of LMP-GNN follows the solution of an $l_p$-norm optimization rooted in the minimum dispersion criterion and yields robust estimates of time-varying graph signals under impulsive noise. A special case of LMP-GNN, named Sign-GNN, is also provided and analyzed. Experimental results on two real-world datasets, a temperature graph and a traffic graph, under four different noise distributions demonstrate the effectiveness and robustness of the proposed LMP-GNN.
Submitted 23 November, 2024; v1 submitted 7 May, 2024; originally announced May 2024.
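
For reference, the classical least mean $p$-th power update that the $l_p$-norm scheme refers to has a textbook form (this is that generic filter step, not the LMP-GNN update itself): taking $p = 2$ recovers LMS, while $p = 1$ gives the sign update echoed by the Sign-GNN special case.

    import numpy as np

    def lmp_step(w, x, d, mu=0.01, p=1.5):
        # Stochastic-gradient step on E|e|^p: the error enters through
        # |e|^(p-1) sign(e), which tempers impulsive outliers when p < 2.
        e = d - w @ x
        w = w + mu * p * (abs(e) ** (p - 1)) * np.sign(e) * x
        return w, e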

arXiv:2405.03372 [pdf, other] cs.NI, cs.AI
Snake Learning: A Communication- and Computation-Efficient Distributed Learning Framework for 6G
Authors: Xiaoxue Yu, Xingfu Yi, Rongpeng Li, Fei Wang, Chenghui Peng, Zhifeng Zhao, Honggang Zhang
Abstract: In the evolution towards 6G, integrating Artificial Intelligence (AI) with advanced network infrastructure emerges as a pivotal strategy for enhancing network intelligence and resource utilization. Existing distributed learning frameworks such as Federated Learning and Split Learning often struggle in dynamic network environments with high synchronization demands, costly communication overhead, severe computing resource consumption, and data heterogeneity across network nodes. These obstacles hinder applications of the ubiquitous computing capabilities of 6G networks, especially in light of escalating model parameter counts and training data volumes. To address these challenges effectively, this paper introduces "Snake Learning", a cost-effective distributed learning framework. Specifically, Snake Learning respects the heterogeneity of inter-node computing capability and local data distribution in 6G networks, and sequentially trains a designated part of the model layers on individual nodes. This layer-by-layer serpentine update mechanism significantly reduces the storage, memory, and communication requirements of the model training phase, and it demonstrates superior adaptability and efficiency for both Computer Vision (CV) training and Large Language Model (LLM) fine-tuning tasks across homogeneous and heterogeneous data distributions.
Submitted 6 May, 2024; originally announced May 2024.
Comments: 7 pages, 6 figures.
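
As I read the serpentine mechanism, the model visits the nodes in sequence and each node updates only its designated slice of layers on local data, which is what keeps per-node storage, memory, and communication low. A minimal sketch under that reading follows; the node-to-layer assignment and the local_train step are my assumptions.

    def snake_round(layers, nodes, local_train):
        # One serpentine pass: while node i holds the model, it trains only
        # its designated layer slice; the rest of the model stays frozen.
        for i, node in enumerate(nodes):
            j = i % len(layers)               # designated layers for node i
            layers[j] = local_train(layers[j], node["data"])
        return layers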

arXiv:2404.19358 [pdf, other] cs.IT
QML-IB: Quantized Collaborative Intelligence between Multiple Devices and the Mobile Network
Authors: Jingchen Peng, Boxiang Ren, Lu Yang, Chenghui Peng, Panpan Niu, Hao Wu
Abstract: The integration of artificial intelligence (AI) and mobile networks is regarded as one of the most important scenarios for 6G, in which a major objective is the efficient transmission of task-relevant data. A key problem then arises: how should collaborative AI models be designed for the device side and the network side so that the data transmitted between them is efficient, meaning that the transmission overhead is low while the AI task result remains accurate? In this paper, we propose the multi-link information bottleneck (ML-IB) scheme for such collaborative model design. We formulate our problem with a novel performance metric that evaluates both task accuracy and transmission overhead. We then introduce a quantizer that is adjustable in quantization bit depth, amplitudes, and breakpoints. Since calculating our proposed metric on high-dimensional data is infeasible, we establish a variational upper bound for the metric. However, because quantization is incorporated, the variational upper bound has no computable closed form, so we employ the Log-Sum Inequality to derive an approximation with a theoretical guarantee. Based on this, we devise the quantized multi-link information bottleneck (QML-IB) algorithm for generating the collaborative AI models. Finally, numerical experiments demonstrate the superior performance of QML-IB compared with the state-of-the-art algorithm.
Submitted 30 April, 2024; originally announced April 2024.
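
A quantizer that is adjustable in breakpoints and output amplitudes, with the bit depth implied by the number of amplitudes ($2^{\text{bits}}$ levels), can be sketched generically as follows; this is my illustration, not the QML-IB construction.

    import numpy as np

    def adjustable_quantizer(x, breakpoints, amplitudes):
        # Map each input to the amplitude of its breakpoint cell; with
        # len(amplitudes) == len(breakpoints) + 1 == 2**bits this realizes
        # a bits-deep quantizer.
        return np.asarray(amplitudes)[np.digitize(x, breakpoints)]

    # Example: a 1-bit quantizer with breakpoint 0 and amplitudes (-1, +1)
    # maps [-0.3, 0.7] to [-1.0, 1.0].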

arXiv:2404.16670 [pdf, other] cs.CV, cs.AI
EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning
Authors: Hongxia Xie, Chu-Jun Peng, Yu-Wen Tseng, Hung-Jen Chen, Chan-Feng Hsu, Hong-Han Shuai, Wen-Huang Cheng
Abstract: Visual Instruction Tuning is a novel learning paradigm that fine-tunes pre-trained language models on task-specific instructions. It shows promising zero-shot results on various natural language processing tasks but remains unexplored for visual emotion understanding. In this work, we focus on enhancing a model's proficiency in understanding and adhering to instructions related to emotional contexts. We first identify key visual clues critical to visual emotion recognition. Subsequently, we introduce a novel GPT-assisted pipeline for generating emotion visual instruction data, effectively addressing the scarcity of annotated instruction data in this domain. Expanding on the groundwork established by InstructBLIP, our proposed EmoVIT architecture incorporates emotion-specific instruction data and leverages the powerful capabilities of Large Language Models to enhance performance. Through extensive experiments, our model showcases its proficiency in emotion classification, adeptness in affective reasoning, and competence in comprehending humor. The comparative analysis provides a robust benchmark for Emotion Visual Instruction Tuning in the era of LLMs, providing valuable insights and opening avenues for future exploration in this domain. Our code is available at https://github.com/aimmemotion/EmoVIT.
Submitted 25 April, 2024; originally announced April 2024.
Comments: Accepted by CVPR 2024.