CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 807 results for author: <span class="mathjax">Xu, D</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xu%2C+D">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Xu, D"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xu%2C+D&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xu, D"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xu%2C+D&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10352">arXiv:2502.10352</a> <span> [<a href="https://arxiv.org/pdf/2502.10352">pdf</a>, <a href="https://arxiv.org/format/2502.10352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Agentic Verification for Ambiguous Query Disambiguation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Youngwon Lee</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+S">Seung-won Hwang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Ruofan Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+F">Feng Yan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Danmei Xu</a>, <a href="/search/cs?searchtype=author&query=Akkad%2C+M">Moutasem Akkad</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Z">Zhewei Yao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuxiong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10352v1-abstract-short" style="display: inline;"> In this work, we tackle the challenge of disambiguating queries in retrieval-augmented generation (RAG) to diverse yet answerable interpretations. State-of-the-arts follow a Diversify-then-Verify (DtV) pipeline, where diverse interpretations are generated by an LLM, later used as search queries to retrieve supporting passages. Such a process may introduce noise in either interpretations or retriev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10352v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10352v1-abstract-full" style="display: none;"> In this work, we tackle the challenge of disambiguating queries in retrieval-augmented generation (RAG) to diverse yet answerable interpretations. State-of-the-arts follow a Diversify-then-Verify (DtV) pipeline, where diverse interpretations are generated by an LLM, later used as search queries to retrieve supporting passages. Such a process may introduce noise in either interpretations or retrieval, particularly in enterprise settings, where LLMs -- trained on static data -- may struggle with domain-specific disambiguations. Thus, a post-hoc verification phase is introduced to prune noises. Our distinction is to unify diversification with verification by incorporating feedback from retriever and generator early on. This joint approach improves both efficiency and robustness by reducing reliance on multiple retrieval and inference steps, which are susceptible to cascading errors. We validate the efficiency and effectiveness of our method, Verified-Diversification with Consolidation (VERDICT), on the widely adopted ASQA benchmark to achieve diverse yet verifiable interpretations. Empirical results show that VERDICT improves grounding-aware F1 score by an average of 23% over the strongest baseline across different backbone LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10352v1-abstract-full').style.display = 'none'; document.getElementById('2502.10352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09021">arXiv:2502.09021</a> <span> [<a href="https://arxiv.org/pdf/2502.09021">pdf</a>, <a href="https://arxiv.org/format/2502.09021">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> From Occupations to Tasks: A New Perspective on Automatability Prediction Using BERT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dawei Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haoran Yang</a>, <a href="/search/cs?searchtype=author&query=Rizoiu%2C+M">Marian-Andrei Rizoiu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guandong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09021v1-abstract-short" style="display: inline;"> As automation technologies continue to advance at an unprecedented rate, concerns about job displacement and the future of work have become increasingly prevalent. While existing research has primarily focused on the potential impact of automation at the occupation level, there has been a lack of investigation into the automatability of individual tasks. This paper addresses this gap by proposing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09021v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09021v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09021v1-abstract-full" style="display: none;"> As automation technologies continue to advance at an unprecedented rate, concerns about job displacement and the future of work have become increasingly prevalent. While existing research has primarily focused on the potential impact of automation at the occupation level, there has been a lack of investigation into the automatability of individual tasks. This paper addresses this gap by proposing a BERT-based classifier to predict the automatability of tasks in the forthcoming decade at a granular level leveraging the context and semantics information of tasks. We leverage three public datasets: O*NET Task Statements, ESCO Skills, and Australian Labour Market Insights Tasks, and perform expert annotation. Our BERT-based classifier, fine-tuned on our task statement data, demonstrates superior performance over traditional machine learning models, neural network architectures, and other transformer models. Our findings also indicate that approximately 25.1% of occupations within the O*NET database are at substantial risk of automation, with a diverse spectrum of automation vulnerability across sectors. This research provides a robust tool for assessing the future impact of automation on the labor market, offering valuable insights for policymakers, workers, and industry leaders in the face of rapid technological advancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09021v1-abstract-full').style.display = 'none'; document.getElementById('2502.09021v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08556">arXiv:2502.08556</a> <span> [<a href="https://arxiv.org/pdf/2502.08556">pdf</a>, <a href="https://arxiv.org/format/2502.08556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Human-Centric Foundation Models: Perception, Generation and Agentic Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuan Wang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Sida Peng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08556v1-abstract-short" style="display: inline;"> Human understanding and generation are critical for modeling digital humans and humanoid embodiments. Recently, Human-centric Foundation Models (HcFMs) inspired by the success of generalist models, such as large language and vision models, have emerged to unify diverse human-centric tasks into a single framework, surpassing traditional task-specific approaches. In this survey, we present a compreh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08556v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08556v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08556v1-abstract-full" style="display: none;"> Human understanding and generation are critical for modeling digital humans and humanoid embodiments. Recently, Human-centric Foundation Models (HcFMs) inspired by the success of generalist models, such as large language and vision models, have emerged to unify diverse human-centric tasks into a single framework, surpassing traditional task-specific approaches. In this survey, we present a comprehensive overview of HcFMs by proposing a taxonomy that categorizes current approaches into four groups: (1) Human-centric Perception Foundation Models that capture fine-grained features for multi-modal 2D and 3D understanding. (2) Human-centric AIGC Foundation Models that generate high-fidelity, diverse human-related content. (3) Unified Perception and Generation Models that integrate these capabilities to enhance both human understanding and synthesis. (4) Human-centric Agentic Foundation Models that extend beyond perception and generation to learn human-like intelligence and interactive behaviors for humanoid embodied tasks. We review state-of-the-art techniques, discuss emerging challenges and future research directions. This survey aims to serve as a roadmap for researchers and practitioners working towards more robust, versatile, and intelligent digital human and embodiments modeling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08556v1-abstract-full').style.display = 'none'; document.getElementById('2502.08556v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05540">arXiv:2502.05540</a> <span> [<a href="https://arxiv.org/pdf/2502.05540">pdf</a>, <a href="https://arxiv.org/format/2502.05540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Demystifying Catastrophic Forgetting in Two-Stage Incremental Object Detector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qirui Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shizhou Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+D">De Cheng</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Y">Yinghui Xing</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Di Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05540v1-abstract-short" style="display: inline;"> Catastrophic forgetting is a critical chanllenge for incremental object detection (IOD). Most existing methods treat the detector monolithically, relying on instance replay or knowledge distillation without analyzing component-specific forgetting. Through dissection of Faster R-CNN, we reveal a key insight: Catastrophic forgetting is predominantly localized to the RoI Head classifier, while regres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05540v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05540v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05540v1-abstract-full" style="display: none;"> Catastrophic forgetting is a critical chanllenge for incremental object detection (IOD). Most existing methods treat the detector monolithically, relying on instance replay or knowledge distillation without analyzing component-specific forgetting. Through dissection of Faster R-CNN, we reveal a key insight: Catastrophic forgetting is predominantly localized to the RoI Head classifier, while regressors retain robustness across incremental stages. This finding challenges conventional assumptions, motivating us to develop a framework termed NSGP-RePRE. Regional Prototype Replay (RePRE) mitigates classifier forgetting via replay of two types of prototypes: coarse prototypes represent class-wise semantic centers of RoI features, while fine-grained prototypes model intra-class variations. Null Space Gradient Projection (NSGP) is further introduced to eliminate prototype-feature misalignment by updating the feature extractor in directions orthogonal to subspace of old inputs via gradient projection, aligning RePRE with incremental learning dynamics. Our simple yet effective design allows NSGP-RePRE to achieve state-of-the-art performance on the Pascal VOC and MS COCO datasets under various settings. Our work not only advances IOD methodology but also provide pivotal insights for catastrophic forgetting mitigation in IOD. Code will be available soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05540v1-abstract-full').style.display = 'none'; document.getElementById('2502.05540v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02817">arXiv:2502.02817</a> <span> [<a href="https://arxiv.org/pdf/2502.02817">pdf</a>, <a href="https://arxiv.org/format/2502.02817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Decade of Action Quality Assessment: Largest Systematic Survey of Trends, Challenges, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hao Yin</a>, <a href="/search/cs?searchtype=author&query=Parmar%2C+P">Paritosh Parmar</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daoliang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianyou Zheng</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+W">Weiwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02817v1-abstract-short" style="display: inline;"> Action Quality Assessment (AQA) -- the ability to quantify the quality of human motion, actions, or skill levels and provide feedback -- has far-reaching implications in areas such as low-cost physiotherapy, sports training, and workforce development. As such, it has become a critical field in computer vision & video understanding over the past decade. Significant progress has been made in AQA met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02817v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02817v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02817v1-abstract-full" style="display: none;"> Action Quality Assessment (AQA) -- the ability to quantify the quality of human motion, actions, or skill levels and provide feedback -- has far-reaching implications in areas such as low-cost physiotherapy, sports training, and workforce development. As such, it has become a critical field in computer vision & video understanding over the past decade. Significant progress has been made in AQA methodologies, datasets, & applications, yet a pressing need remains for a comprehensive synthesis of this rapidly evolving field. In this paper, we present a thorough survey of the AQA landscape, systematically reviewing over 200 research papers using the preferred reporting items for systematic reviews & meta-analyses (PRISMA) framework. We begin by covering foundational concepts & definitions, then move to general frameworks & performance metrics, & finally discuss the latest advances in methodologies & datasets. This survey provides a detailed analysis of research trends, performance comparisons, challenges, & future directions. Through this work, we aim to offer a valuable resource for both newcomers & experienced researchers, promoting further exploration & progress in AQA. Data are available at https://haoyin116.github.io/Survey_of_AQA/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02817v1-abstract-full').style.display = 'none'; document.getElementById('2502.02817v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 Pages, 20 Figures, 12 Tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01567">arXiv:2502.01567</a> <span> [<a href="https://arxiv.org/pdf/2502.01567">pdf</a>, <a href="https://arxiv.org/format/2502.01567">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Scalable Language Models with Posterior Inference of Latent Thought Vectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+D">Deqian Kong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Minglu Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dehong Xu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+B">Bo Pang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shu Wang</a>, <a href="/search/cs?searchtype=author&query=Honig%2C+E">Edouardo Honig</a>, <a href="/search/cs?searchtype=author&query=Si%2C+Z">Zhangzhang Si</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuan Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jianwen Xie</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Sirui Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y+N">Ying Nian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01567v1-abstract-short" style="display: inline;"> We propose a novel family of language models, Latent-Thought Language Models (LTMs), which incorporate explicit latent thought vectors that follow an explicit prior model in latent space. These latent thought vectors guide the autoregressive generation of ground tokens through a Transformer decoder. Training employs a dual-rate optimization process within the classical variational Bayes framework:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01567v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01567v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01567v1-abstract-full" style="display: none;"> We propose a novel family of language models, Latent-Thought Language Models (LTMs), which incorporate explicit latent thought vectors that follow an explicit prior model in latent space. These latent thought vectors guide the autoregressive generation of ground tokens through a Transformer decoder. Training employs a dual-rate optimization process within the classical variational Bayes framework: fast learning of local variational parameters for the posterior distribution of latent vectors, and slow learning of global decoder parameters. Empirical studies reveal that LTMs possess additional scaling dimensions beyond traditional LLMs, yielding a structured design space. Higher sample efficiency can be achieved by increasing training compute per token, with further gains possible by trading model size for more inference steps. Designed based on these scaling properties, LTMs demonstrate superior sample and parameter efficiency compared to conventional autoregressive models and discrete diffusion models. They significantly outperform these counterparts in validation perplexity and zero-shot language modeling. Additionally, LTMs exhibit emergent few-shot in-context reasoning capabilities that scale with model and latent size, and achieve competitive performance in conditional and unconditional text generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01567v1-abstract-full').style.display = 'none'; document.getElementById('2502.01567v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16362">arXiv:2501.16362</a> <span> [<a href="https://arxiv.org/pdf/2501.16362">pdf</a>, <a href="https://arxiv.org/format/2501.16362">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> </div> </div> <p class="title is-5 mathjax"> A novel Trunk Branch-net PINN for flow and heat transfer prediction in porous medium </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+H">Haoyun Xing</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+K">Kaiyan Jin</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+G">Guice Yao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dichu Xu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+D">Dongsheng Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16362v1-abstract-short" style="display: inline;"> A novel Trunk-Branch (TB)-net physics-informed neural network (PINN) architecture is developed, which is a PINN-based method incorporating trunk and branch nets to capture both global and local features. The aim is to solve four main classes of problems: forward flow problem, forward heat transfer problem, inverse heat transfer problem, and transfer learning problem within the porous medium, which… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16362v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16362v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16362v1-abstract-full" style="display: none;"> A novel Trunk-Branch (TB)-net physics-informed neural network (PINN) architecture is developed, which is a PINN-based method incorporating trunk and branch nets to capture both global and local features. The aim is to solve four main classes of problems: forward flow problem, forward heat transfer problem, inverse heat transfer problem, and transfer learning problem within the porous medium, which are notoriously complex that could not be handled by origin PINN. In the proposed TB-net PINN architecture, a Fully-connected Neural Network (FNN) is used as the trunk net, followed by separated FNNs as the branch nets with respect to outputs, and automatic differentiation is performed for partial derivatives of outputs with respect to inputs by considering various physical loss. The effectiveness and flexibility of the novel TB-net PINN architecture is demonstrated through a collection of forward problems, and transfer learning validates the feasibility of resource reuse. Combining with the superiority over traditional numerical methods in solving inverse problems, the proposed TB-net PINN shows its great potential for practical engineering applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16362v1-abstract-full').style.display = 'none'; document.getElementById('2501.16362v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 17 figures,</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06714">arXiv:2501.06714</a> <span> [<a href="https://arxiv.org/pdf/2501.06714">pdf</a>, <a href="https://arxiv.org/format/2501.06714">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> F3D-Gaus: Feed-forward 3D-aware Generation on ImageNet with Cycle-Consistent Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qianyi Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06714v2-abstract-short" style="display: inline;"> This paper tackles the problem of generalizable 3D-aware generation from monocular datasets, e.g., ImageNet. The key challenge of this task is learning a robust 3D-aware representation without multi-view or dynamic data, while ensuring consistent texture and geometry across different viewpoints. Although some baseline methods are capable of 3D-aware generation, the quality of the generated images… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06714v2-abstract-full').style.display = 'inline'; document.getElementById('2501.06714v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06714v2-abstract-full" style="display: none;"> This paper tackles the problem of generalizable 3D-aware generation from monocular datasets, e.g., ImageNet. The key challenge of this task is learning a robust 3D-aware representation without multi-view or dynamic data, while ensuring consistent texture and geometry across different viewpoints. Although some baseline methods are capable of 3D-aware generation, the quality of the generated images still lags behind state-of-the-art 2D generation approaches, which excel in producing high-quality, detailed images. To address this severe limitation, we propose a novel feed-forward pipeline based on pixel-aligned Gaussian Splatting, coined as F3D-Gaus, which can produce more realistic and reliable 3D renderings from monocular inputs. In addition, we introduce a self-supervised cycle-consistent constraint to enforce cross-view consistency in the learned 3D representation. This training strategy naturally allows aggregation of multiple aligned Gaussian primitives and significantly alleviates the interpolation limitations inherent in single-view pixel-aligned Gaussian Splatting. Furthermore, we incorporate video model priors to perform geometry-aware refinement, enhancing the generation of fine details in wide-viewpoint scenarios and improving the model's capability to capture intricate 3D textures. Extensive experiments demonstrate that our approach not only achieves high-quality, multi-view consistent 3D-aware generation from monocular datasets, but also significantly improves training and inference efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06714v2-abstract-full').style.display = 'none'; document.getElementById('2501.06714v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://w-ted.github.io/publications/F3D-Gaus</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05611">arXiv:2501.05611</a> <span> [<a href="https://arxiv.org/pdf/2501.05611">pdf</a>, <a href="https://arxiv.org/format/2501.05611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bit-depth color recovery via off-the-shelf super-resolution models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xuanshuo Fu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Danna Xue</a>, <a href="/search/cs?searchtype=author&query=Vazquez-Corral%2C+J">Javier Vazquez-Corral</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05611v1-abstract-short" style="display: inline;"> Advancements in imaging technology have enabled hardware to support 10 to 16 bits per channel, facilitating precise manipulation in applications like image editing and video processing. While deep neural networks promise to recover high bit-depth representations, existing methods often rely on scale-invariant image information, limiting performance in certain scenarios. In this paper, we introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05611v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05611v1-abstract-full" style="display: none;"> Advancements in imaging technology have enabled hardware to support 10 to 16 bits per channel, facilitating precise manipulation in applications like image editing and video processing. While deep neural networks promise to recover high bit-depth representations, existing methods often rely on scale-invariant image information, limiting performance in certain scenarios. In this paper, we introduce a novel approach that integrates a super-resolution architecture to extract detailed a priori information from images. By leveraging interpolated data generated during the super-resolution process, our method achieves pixel-level recovery of fine-grained color details. Additionally, we demonstrate that spatial features learned through the super-resolution process significantly contribute to the recovery of detailed color depth information. Experiments on benchmark datasets demonstrate that our approach outperforms state-of-the-art methods, highlighting the potential of super-resolution for high-fidelity color restoration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05611v1-abstract-full').style.display = 'none'; document.getElementById('2501.05611v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04329">arXiv:2501.04329</a> <span> [<a href="https://arxiv.org/pdf/2501.04329">pdf</a>, <a href="https://arxiv.org/format/2501.04329">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An Efficient Adaptive Compression Method for Human Perception and Machine Vision Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhihao Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04329v1-abstract-short" style="display: inline;"> While most existing neural image compression (NIC) and neural video compression (NVC) methodologies have achieved remarkable success, their optimization is primarily focused on human visual perception. However, with the rapid development of artificial intelligence, many images and videos will be used for various machine vision tasks. Consequently, such existing compression methodologies cannot ach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04329v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04329v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04329v1-abstract-full" style="display: none;"> While most existing neural image compression (NIC) and neural video compression (NVC) methodologies have achieved remarkable success, their optimization is primarily focused on human visual perception. However, with the rapid development of artificial intelligence, many images and videos will be used for various machine vision tasks. Consequently, such existing compression methodologies cannot achieve competitive performance in machine vision. In this work, we introduce an efficient adaptive compression (EAC) method tailored for both human perception and multiple machine vision tasks. Our method involves two key modules: 1), an adaptive compression mechanism, that adaptively selects several subsets from latent features to balance the optimizations for multiple machine vision tasks (e.g., segmentation, and detection) and human vision. 2), a task-specific adapter, that uses the parameter-efficient delta-tuning strategy to stimulate the comprehensive downstream analytical networks for specific machine vision tasks. By using the above two modules, we can optimize the bit-rate costs and improve machine vision performance. In general, our proposed EAC can seamlessly integrate with existing NIC (i.e., Ball茅2018, and Cheng2020) and NVC (i.e., DVC, and FVC) methods. Extensive evaluation on various benchmark datasets (i.e., VOC2007, ILSVRC2012, VOC2012, COCO, UCF101, and DAVIS) shows that our method enhances performance for multiple machine vision tasks while maintaining the quality of human vision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04329v1-abstract-full').style.display = 'none'; document.getElementById('2501.04329v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03631">arXiv:2501.03631</a> <span> [<a href="https://arxiv.org/pdf/2501.03631">pdf</a>, <a href="https://arxiv.org/format/2501.03631">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Iterative Manifold Constraint for Zero-shot Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Maomao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunfei Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03631v2-abstract-short" style="display: inline;"> Editability and fidelity are two essential demands for text-driven image editing, which expects that the editing area should align with the target prompt and the rest remain unchanged separately. The current cutting-edge editing methods usually obey an "inversion-then-editing" pipeline, where the input image is inverted to an approximate Gaussian noise ${z}_T$, based on which a sampling process is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03631v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03631v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03631v2-abstract-full" style="display: none;"> Editability and fidelity are two essential demands for text-driven image editing, which expects that the editing area should align with the target prompt and the rest remain unchanged separately. The current cutting-edge editing methods usually obey an "inversion-then-editing" pipeline, where the input image is inverted to an approximate Gaussian noise ${z}_T$, based on which a sampling process is conducted using the target prompt. Nevertheless, we argue that it is not a good choice to use a near-Gaussian noise as a pivot for further editing since it would bring plentiful fidelity errors. We verify this by a pilot analysis, discovering that intermediate-inverted latents can achieve a better trade-off between editability and fidelity than the fully-inverted ${z}_T$. Based on this, we propose a novel zero-shot editing paradigm dubbed ZZEdit, which first locates a qualified intermediate-inverted latent marked as ${z}_p$ as a better editing pivot, which is sufficient-for-editing while structure-preserving. Then, a ZigZag process is designed to execute denoising and inversion alternately, which progressively inject target guidance to ${z}_p$ while preserving the structure information of $p$ step. Afterwards, to achieve the same step number of inversion and denoising, we execute a pure sampling process under the target prompt. Essentially, our ZZEdit performs iterative manifold constraint between the manifold of $M_{p}$ and $M_{p-1}$, leading to fewer fidelity errors. Extensive experiments highlight the effectiveness of ZZEdit in diverse image editing scenarios compared with the "inversion-then-editing" pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03631v2-abstract-full').style.display = 'none'; document.getElementById('2501.03631v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01085">arXiv:2501.01085</a> <span> [<a href="https://arxiv.org/pdf/2501.01085">pdf</a>, <a href="https://arxiv.org/format/2501.01085">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Noise-Resilient Symbolic Regression with Dynamic Gating Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chenglu Sun</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shuo Shen</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+W">Wenzhi Tao</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Deyi Xue</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zixia Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01085v1-abstract-short" style="display: inline;"> Symbolic regression (SR) has emerged as a pivotal technique for uncovering the intrinsic information within data and enhancing the interpretability of AI models. However, current state-of-the-art (sota) SR methods struggle to perform correct recovery of symbolic expressions from high-noise data. To address this issue, we introduce a novel noise-resilient SR (NRSR) method capable of recovering expr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01085v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01085v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01085v1-abstract-full" style="display: none;"> Symbolic regression (SR) has emerged as a pivotal technique for uncovering the intrinsic information within data and enhancing the interpretability of AI models. However, current state-of-the-art (sota) SR methods struggle to perform correct recovery of symbolic expressions from high-noise data. To address this issue, we introduce a novel noise-resilient SR (NRSR) method capable of recovering expressions from high-noise data. Our method leverages a novel reinforcement learning (RL) approach in conjunction with a designed noise-resilient gating module (NGM) to learn symbolic selection policies. The gating module can dynamically filter the meaningless information from high-noise data, thereby demonstrating a high noise-resilient capability for the SR process. And we also design a mixed path entropy (MPE) bonus term in the RL process to increase the exploration capabilities of the policy. Experimental results demonstrate that our method significantly outperforms several popular baselines on benchmarks with high-noise data. Furthermore, our method also can achieve sota performance on benchmarks with clean data, showcasing its robustness and efficacy in SR tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01085v1-abstract-full').style.display = 'none'; document.getElementById('2501.01085v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 2 figures, accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00602">arXiv:2501.00602</a> <span> [<a href="https://arxiv.org/pdf/2501.00602">pdf</a>, <a href="https://arxiv.org/format/2501.00602">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> STORM: Spatio-Temporal Reconstruction Model for Large-Scale Outdoor Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiawei Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiahui Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyi Li</a>, <a href="/search/cs?searchtype=author&query=You%2C+Y">Yurong You</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+A">Apoorva Sharma</a>, <a href="/search/cs?searchtype=author&query=Igl%2C+M">Maximilian Igl</a>, <a href="/search/cs?searchtype=author&query=Karkus%2C+P">Peter Karkus</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Danfei Xu</a>, <a href="/search/cs?searchtype=author&query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00602v1-abstract-short" style="display: inline;"> We present STORM, a spatio-temporal reconstruction model designed for reconstructing dynamic outdoor scenes from sparse observations. Existing dynamic reconstruction methods often rely on per-scene optimization, dense observations across space and time, and strong motion supervision, resulting in lengthy optimization times, limited generalization to novel views or scenes, and degenerated quality c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00602v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00602v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00602v1-abstract-full" style="display: none;"> We present STORM, a spatio-temporal reconstruction model designed for reconstructing dynamic outdoor scenes from sparse observations. Existing dynamic reconstruction methods often rely on per-scene optimization, dense observations across space and time, and strong motion supervision, resulting in lengthy optimization times, limited generalization to novel views or scenes, and degenerated quality caused by noisy pseudo-labels for dynamics. To address these challenges, STORM leverages a data-driven Transformer architecture that directly infers dynamic 3D scene representations--parameterized by 3D Gaussians and their velocities--in a single forward pass. Our key design is to aggregate 3D Gaussians from all frames using self-supervised scene flows, transforming them to the target timestep to enable complete (i.e., "amodal") reconstructions from arbitrary viewpoints at any moment in time. As an emergent property, STORM automatically captures dynamic instances and generates high-quality masks using only reconstruction losses. Extensive experiments on public datasets show that STORM achieves precise dynamic scene reconstruction, surpassing state-of-the-art per-scene optimization methods (+4.3 to 6.6 PSNR) and existing feed-forward approaches (+2.1 to 4.7 PSNR) in dynamic regions. STORM reconstructs large-scale outdoor scenes in 200ms, supports real-time rendering, and outperforms competitors in scene flow estimation, improving 3D EPE by 0.422m and Acc5 by 28.02%. Beyond reconstruction, we showcase four additional applications of our model, illustrating the potential of self-supervised learning for broader dynamic scene understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00602v1-abstract-full').style.display = 'none'; document.getElementById('2501.00602v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page at: https://jiawei-yang.github.io/STORM/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00601">arXiv:2501.00601</a> <span> [<a href="https://arxiv.org/pdf/2501.00601">pdf</a>, <a href="https://arxiv.org/format/2501.00601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DreamDrive: Generative 4D Scene Modeling from Street View Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jiageng Mao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyi Li</a>, <a href="/search/cs?searchtype=author&query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&query=You%2C+Y">Yurong You</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaowei Xiao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Danfei Xu</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00601v2-abstract-short" style="display: inline;"> Synthesizing photo-realistic visual observations from an ego vehicle's driving trajectory is a critical step towards scalable training of self-driving models. Reconstruction-based methods create 3D scenes from driving logs and synthesize geometry-consistent driving videos through neural rendering, but their dependence on costly object annotations limits their ability to generalize to in-the-wild d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00601v2-abstract-full').style.display = 'inline'; document.getElementById('2501.00601v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00601v2-abstract-full" style="display: none;"> Synthesizing photo-realistic visual observations from an ego vehicle's driving trajectory is a critical step towards scalable training of self-driving models. Reconstruction-based methods create 3D scenes from driving logs and synthesize geometry-consistent driving videos through neural rendering, but their dependence on costly object annotations limits their ability to generalize to in-the-wild driving scenarios. On the other hand, generative models can synthesize action-conditioned driving videos in a more generalizable way but often struggle with maintaining 3D visual consistency. In this paper, we present DreamDrive, a 4D spatial-temporal scene generation approach that combines the merits of generation and reconstruction, to synthesize generalizable 4D driving scenes and dynamic driving videos with 3D consistency. Specifically, we leverage the generative power of video diffusion models to synthesize a sequence of visual references and further elevate them to 4D with a novel hybrid Gaussian representation. Given a driving trajectory, we then render 3D-consistent driving videos via Gaussian splatting. The use of generative priors allows our method to produce high-quality 4D scenes from in-the-wild driving data, while neural rendering ensures 3D-consistent video generation from the 4D scenes. Extensive experiments on nuScenes and street view images demonstrate that DreamDrive can generate controllable and generalizable 4D driving scenes, synthesize novel views of driving videos with high fidelity and 3D consistency, decompose static and dynamic elements in a self-supervised manner, and enhance perception and planning tasks for autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00601v2-abstract-full').style.display = 'none'; document.getElementById('2501.00601v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://pointscoder.github.io/DreamDrive/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19318">arXiv:2412.19318</a> <span> [<a href="https://arxiv.org/pdf/2412.19318">pdf</a>, <a href="https://arxiv.org/format/2412.19318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Conformal Inference by Betting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Podkopaev%2C+A">Aleksandr Podkopaev</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Darren Xu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kuang-Chih Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19318v1-abstract-short" style="display: inline;"> Conformal prediction is a valuable tool for quantifying predictive uncertainty of machine learning models. However, its applicability relies on the assumption of data exchangeability, a condition which is often not met in real-world scenarios. In this paper, we consider the problem of adaptive conformal inference without any assumptions about the data generating process. Existing approaches for ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19318v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19318v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19318v1-abstract-full" style="display: none;"> Conformal prediction is a valuable tool for quantifying predictive uncertainty of machine learning models. However, its applicability relies on the assumption of data exchangeability, a condition which is often not met in real-world scenarios. In this paper, we consider the problem of adaptive conformal inference without any assumptions about the data generating process. Existing approaches for adaptive conformal inference are based on optimizing the pinball loss using variants of online gradient descent. A notable shortcoming of such approaches is in their explicit dependence on and sensitivity to the choice of the learning rates. In this paper, we propose a different approach for adaptive conformal inference that leverages parameter-free online convex optimization techniques. We prove that our method controls long-term miscoverage frequency at a nominal level and demonstrate its convincing empirical performance without any need of performing cumbersome parameter tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19318v1-abstract-full').style.display = 'none'; document.getElementById('2412.19318v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18589">arXiv:2412.18589</a> <span> [<a href="https://arxiv.org/pdf/2412.18589">pdf</a>, <a href="https://arxiv.org/format/2412.18589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-Driven Tumor Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinran Li</a>, <a href="/search/cs?searchtype=author&query=Shuai%2C+Y">Yi Shuai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chen Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qilong Wu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pengfei Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Can Zhao</a>, <a href="/search/cs?searchtype=author&query=Bassi%2C+P+R+A+S">Pedro R. A. S. Bassi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daguang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zongwei Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18589v1-abstract-short" style="display: inline;"> Tumor synthesis can generate examples that AI often misses or over-detects, improving AI performance by training on these challenging cases. However, existing synthesis methods, which are typically unconditional -- generating images from random variables -- or conditioned only by tumor shapes, lack controllability over specific tumor characteristics such as texture, heterogeneity, boundaries, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18589v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18589v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18589v1-abstract-full" style="display: none;"> Tumor synthesis can generate examples that AI often misses or over-detects, improving AI performance by training on these challenging cases. However, existing synthesis methods, which are typically unconditional -- generating images from random variables -- or conditioned only by tumor shapes, lack controllability over specific tumor characteristics such as texture, heterogeneity, boundaries, and pathology type. As a result, the generated tumors may be overly similar or duplicates of existing training data, failing to effectively address AI's weaknesses. We propose a new text-driven tumor synthesis approach, termed TextoMorph, that provides textual control over tumor characteristics. This is particularly beneficial for examples that confuse the AI the most, such as early tumor detection (increasing Sensitivity by +8.5%), tumor segmentation for precise radiotherapy (increasing DSC by +6.3%), and classification between benign and malignant tumors (improving Sensitivity by +8.2%). By incorporating text mined from radiology reports into the synthesis process, we increase the variability and controllability of the synthetic tumors to target AI's failure cases more precisely. Moreover, TextoMorph uses contrastive learning across different texts and CT scans, significantly reducing dependence on scarce image-report pairs (only 141 pairs used in this study) by leveraging a large corpus of 34,035 radiology reports. Finally, we have developed rigorous tests to evaluate synthetic tumors, including Text-Driven Visual Turing Test and Radiomics Pattern Analysis, showing that our synthetic tumors is realistic and diverse in texture, heterogeneity, boundaries, and pathology. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18589v1-abstract-full').style.display = 'none'; document.getElementById('2412.18589v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18537">arXiv:2412.18537</a> <span> [<a href="https://arxiv.org/pdf/2412.18537">pdf</a>, <a href="https://arxiv.org/format/2412.18537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Harnessing Large Language Models for Knowledge Graph Question Answering via Adaptive Multi-Aspect Retrieval-Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+D">Derong Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinhang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhenxi Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhihong Zhu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhi Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xian Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tong Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+E">Enhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18537v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) demonstrate remarkable capabilities, yet struggle with hallucination and outdated knowledge when tasked with complex knowledge reasoning, resulting in factually incorrect outputs. Previous studies have attempted to mitigate it by retrieving factual knowledge from large-scale knowledge graphs (KGs) to assist LLMs in logical reasoning and prediction of answers. However,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18537v2-abstract-full').style.display = 'inline'; document.getElementById('2412.18537v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18537v2-abstract-full" style="display: none;"> Large Language Models (LLMs) demonstrate remarkable capabilities, yet struggle with hallucination and outdated knowledge when tasked with complex knowledge reasoning, resulting in factually incorrect outputs. Previous studies have attempted to mitigate it by retrieving factual knowledge from large-scale knowledge graphs (KGs) to assist LLMs in logical reasoning and prediction of answers. However, this kind of approach often introduces noise and irrelevant data, especially in situations with extensive context from multiple knowledge aspects. In this way, LLM attention can be potentially mislead from question and relevant information. In our study, we introduce an Adaptive Multi-Aspect Retrieval-augmented over KGs (Amar) framework. This method retrieves knowledge including entities, relations, and subgraphs, and converts each piece of retrieved text into prompt embeddings. The Amar framework comprises two key sub-components: 1) a self-alignment module that aligns commonalities among entities, relations, and subgraphs to enhance retrieved text, thereby reducing noise interference; 2) a relevance gating module that employs a soft gate to learn the relevance score between question and multi-aspect retrieved data, to determine which information should be used to enhance LLMs' output, or even filtered altogether. Our method has achieved state-of-the-art performance on two common datasets, WebQSP and CWQ, showing a 1.9\% improvement in accuracy over its best competitor and a 6.6\% improvement in logical form generation over a method that directly uses retrieved text as context prompts. These results demonstrate the effectiveness of Amar in improving the reasoning of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18537v2-abstract-full').style.display = 'none'; document.getElementById('2412.18537v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI'2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18235">arXiv:2412.18235</a> <span> [<a href="https://arxiv.org/pdf/2412.18235">pdf</a>, <a href="https://arxiv.org/format/2412.18235">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Band Prompting Aided SAR and Multi-Spectral Data Fusion Framework for Local Climate Zone Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lan%2C+H">Haiyan Lan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shujun Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+M">Mingjie Xie</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xuanjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongning Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pengming Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dongli Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+G">Guangjun He</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18235v1-abstract-short" style="display: inline;"> Local climate zone (LCZ) classification is of great value for understanding the complex interactions between urban development and local climate. Recent studies have increasingly focused on the fusion of synthetic aperture radar (SAR) and multi-spectral data to improve LCZ classification performance. However, it remains challenging due to the distinct physical properties of these two types of data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18235v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18235v1-abstract-full" style="display: none;"> Local climate zone (LCZ) classification is of great value for understanding the complex interactions between urban development and local climate. Recent studies have increasingly focused on the fusion of synthetic aperture radar (SAR) and multi-spectral data to improve LCZ classification performance. However, it remains challenging due to the distinct physical properties of these two types of data and the absence of effective fusion guidance. In this paper, a novel band prompting aided data fusion framework is proposed for LCZ classification, namely BP-LCZ, which utilizes textual prompts associated with band groups to guide the model in learning the physical attributes of different bands and semantics of various categories inherent in SAR and multi-spectral data to augment the fused feature, thus enhancing LCZ classification performance. Specifically, a band group prompting (BGP) strategy is introduced to align the visual representation effectively at the level of band groups, which also facilitates a more adequate extraction of semantic information of different bands with textual information. In addition, a multivariate supervised matrix (MSM) based training strategy is proposed to alleviate the problem of positive and negative sample confusion by completing the supervised information. The experimental results demonstrate the effectiveness and superiority of the proposed data fusion framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18235v1-abstract-full').style.display = 'none'; document.getElementById('2412.18235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17290">arXiv:2412.17290</a> <span> [<a href="https://arxiv.org/pdf/2412.17290">pdf</a>, <a href="https://arxiv.org/format/2412.17290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Free-viewpoint Human Animation with Pose-correlated Reference Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+F">Fa-Ting Hong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhan Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haiyang Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qinjie Lin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Luchuan Song</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhixin Shu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&query=Ceylan%2C+D">Duygu Ceylan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17290v2-abstract-short" style="display: inline;"> Diffusion-based human animation aims to animate a human character based on a source human image as well as driving signals such as a sequence of poses. Leveraging the generative capacity of diffusion model, existing approaches are able to generate high-fidelity poses, but struggle with significant viewpoint changes, especially in zoom-in/zoom-out scenarios where camera-character distance varies. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17290v2-abstract-full').style.display = 'inline'; document.getElementById('2412.17290v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17290v2-abstract-full" style="display: none;"> Diffusion-based human animation aims to animate a human character based on a source human image as well as driving signals such as a sequence of poses. Leveraging the generative capacity of diffusion model, existing approaches are able to generate high-fidelity poses, but struggle with significant viewpoint changes, especially in zoom-in/zoom-out scenarios where camera-character distance varies. This limits the applications such as cinematic shot type plan or camera control. We propose a pose-correlated reference selection diffusion network, supporting substantial viewpoint variations in human animation. Our key idea is to enable the network to utilize multiple reference images as input, since significant viewpoint changes often lead to missing appearance details on the human body. To eliminate the computational cost, we first introduce a novel pose correlation module to compute similarities between non-aligned target and source poses, and then propose an adaptive reference selection strategy, utilizing the attention map to identify key regions for animation generation. To train our model, we curated a large dataset from public TED talks featuring varied shots of the same character, helping the model learn synthesis for different perspectives. Our experimental results show that with the same number of reference images, our model performs favorably compared to the current SOTA methods under large viewpoint change. We further show that the adaptive reference selection is able to choose the most relevant reference regions to generate humans under free viewpoints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17290v2-abstract-full').style.display = 'none'; document.getElementById('2412.17290v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review; Project page: https://harlanhong.github.io/publications/fvhuman/index.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16674">arXiv:2412.16674</a> <span> [<a href="https://arxiv.org/pdf/2412.16674">pdf</a>, <a href="https://arxiv.org/format/2412.16674">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> STAMPsy: Towards SpatioTemporal-Aware Mixed-Type Dialogues for Psychological Counseling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jieyi Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yue Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zeming Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dexuan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chuan Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaoming Shi</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+R">Ruiyuan Guan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongxing Wang</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+W">Weihua Yue</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yu Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16674v1-abstract-short" style="display: inline;"> Online psychological counseling dialogue systems are trending, offering a convenient and accessible alternative to traditional in-person therapy. However, existing psychological counseling dialogue systems mainly focus on basic empathetic dialogue or QA with minimal professional knowledge and without goal guidance. In many real-world counseling scenarios, clients often seek multi-type help, such a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16674v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16674v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16674v1-abstract-full" style="display: none;"> Online psychological counseling dialogue systems are trending, offering a convenient and accessible alternative to traditional in-person therapy. However, existing psychological counseling dialogue systems mainly focus on basic empathetic dialogue or QA with minimal professional knowledge and without goal guidance. In many real-world counseling scenarios, clients often seek multi-type help, such as diagnosis, consultation, therapy, console, and common questions, but existing dialogue systems struggle to combine different dialogue types naturally. In this paper, we identify this challenge as how to construct mixed-type dialogue systems for psychological counseling that enable clients to clarify their goals before proceeding with counseling. To mitigate the challenge, we collect a mixed-type counseling dialogues corpus termed STAMPsy, covering five dialogue types, task-oriented dialogue for diagnosis, knowledge-grounded dialogue, conversational recommendation, empathetic dialogue, and question answering, over 5,000 conversations. Moreover, spatiotemporal-aware knowledge enables systems to have world awareness and has been proven to affect one's mental health. Therefore, we link dialogues in STAMPsy to spatiotemporal state and propose a spatiotemporal-aware mixed-type psychological counseling dataset. Additionally, we build baselines on STAMPsy and develop an iterative self-feedback psychological dialogue generation framework, named Self-STAMPsy. Results indicate that clarifying dialogue goals in advance and utilizing spatiotemporal states are effective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16674v1-abstract-full').style.display = 'none'; document.getElementById('2412.16674v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15537">arXiv:2412.15537</a> <span> [<a href="https://arxiv.org/pdf/2412.15537">pdf</a>, <a href="https://arxiv.org/format/2412.15537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Large-scale UAV Route Planing with Global and Local Features via Reinforcement Graph Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tao Zhou</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+K">Kai Ye</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zeyu Shi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiajing Lin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dejun Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Min Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15537v1-abstract-short" style="display: inline;"> Numerous remarkable advancements have been made in accuracy, speed, and parallelism for solving the Unmanned Aerial Vehicle Route Planing (UAVRP). However, existing UAVRP solvers face challenges when attempting to scale effectively and efficiently for larger instances. In this paper, we present a generalization framework that enables current UAVRP solvers to robustly extend their capabilities to l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15537v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15537v1-abstract-full" style="display: none;"> Numerous remarkable advancements have been made in accuracy, speed, and parallelism for solving the Unmanned Aerial Vehicle Route Planing (UAVRP). However, existing UAVRP solvers face challenges when attempting to scale effectively and efficiently for larger instances. In this paper, we present a generalization framework that enables current UAVRP solvers to robustly extend their capabilities to larger instances, accommodating up to 10,000 points, using widely recognized test sets. The UAVRP under a large number of patrol points is a typical large-scale TSP problem.Our proposed framework comprises three distinct steps. Firstly, we employ Delaunay triangulation to extract subgraphs from large instances while preserving global features. Secondly, we utilize an embedded TSP solver to obtain sub-results, followed by graph fusion. Finally, we implement a decoding strategy customizable to the user's requirements, resulting in high-quality solutions, complemented by a warming-up process for the heatmap. To demonstrate the flexibility of our approach, we integrate two representative TSP solvers into our framework and conduct a comprehensive comparative analysis against existing algorithms using large TSP benchmark datasets. The results unequivocally demonstrate that our framework efficiently scales existing TSP solvers to handle large instances and consistently outperforms state-of-the-art (SOTA) methods. Furthermore, since our proposed framework does not necessitate additional training or fine-tuning, we believe that its generality can significantly advance research on end-to-end UAVRP solvers, enabling the application of a broader range of methods to real-world scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15537v1-abstract-full').style.display = 'none'; document.getElementById('2412.15537v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14718">arXiv:2412.14718</a> <span> [<a href="https://arxiv.org/pdf/2412.14718">pdf</a>, <a href="https://arxiv.org/format/2412.14718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Forecasting Framework based on Multi-Stage Hierarchical Forecasting Reconciliation and Adjustment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengchao Yang</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+M">Mithun Ghosh</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+A">Anish Saha</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a>, <a href="/search/cs?searchtype=author&query=Shmakov%2C+K">Konstantin Shmakov</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kuang-chih Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14718v1-abstract-short" style="display: inline;"> Ads demand forecasting for Walmart's ad products plays a critical role in enabling effective resource planning, allocation, and management of ads performance. In this paper, we introduce a comprehensive demand forecasting system that tackles hierarchical time series forecasting in business settings. Though traditional hierarchical reconciliation methods ensure forecasting coherence, they often tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14718v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14718v1-abstract-full" style="display: none;"> Ads demand forecasting for Walmart's ad products plays a critical role in enabling effective resource planning, allocation, and management of ads performance. In this paper, we introduce a comprehensive demand forecasting system that tackles hierarchical time series forecasting in business settings. Though traditional hierarchical reconciliation methods ensure forecasting coherence, they often trade off accuracy for coherence especially at lower levels and fail to capture the seasonality unique to each time-series in the hierarchy. Thus, we propose a novel framework "Multi-Stage Hierarchical Forecasting Reconciliation and Adjustment (Multi-Stage HiFoReAd)" to address the challenges of preserving seasonality, ensuring coherence, and improving accuracy. Our system first utilizes diverse models, ensembled through Bayesian Optimization (BO), achieving base forecasts. The generated base forecasts are then passed into the Multi-Stage HiFoReAd framework. The initial stage refines the hierarchy using Top-Down forecasts and "harmonic alignment." The second stage aligns the higher levels' forecasts using MinTrace algorithm, following which the last two levels undergo "harmonic alignment" and "stratified scaling", to eventually achieve accurate and coherent forecasts across the whole hierarchy. Our experiments on Walmart's internal Ads-demand dataset and 3 other public datasets, each with 4 hierarchical levels, demonstrate that the average Absolute Percentage Error from the cross-validation sets improve from 3% to 40% across levels against BO-ensemble of models (LGBM, MSTL+ETS, Prophet) as well as from 1.2% to 92.9% against State-Of-The-Art models. In addition, the forecasts at all hierarchical levels are proved to be coherent. The proposed framework has been deployed and leveraged by Walmart's ads, sales and operations teams to track future demands, make informed decisions and plan resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14718v1-abstract-full').style.display = 'none'; document.getElementById('2412.14718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in 2024 IEEE International Conference on Big Data (BigData)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13667">arXiv:2412.13667</a> <span> [<a href="https://arxiv.org/pdf/2412.13667">pdf</a>, <a href="https://arxiv.org/format/2412.13667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Exploring Multi-Modal Integration with Tool-Augmented LLM Agents for Precise Causal Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+C">ChengAo Shen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengzhang Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Dongsheng Luo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dongkuan Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jingchao Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13667v1-abstract-short" style="display: inline;"> Causal inference is an imperative foundation for decision-making across domains, such as smart health, AI for drug discovery and AIOps. Traditional statistical causal discovery methods, while well-established, predominantly rely on observational data and often overlook the semantic cues inherent in cause-and-effect relationships. The advent of Large Language Models (LLMs) has ushered in an afforda… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13667v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13667v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13667v1-abstract-full" style="display: none;"> Causal inference is an imperative foundation for decision-making across domains, such as smart health, AI for drug discovery and AIOps. Traditional statistical causal discovery methods, while well-established, predominantly rely on observational data and often overlook the semantic cues inherent in cause-and-effect relationships. The advent of Large Language Models (LLMs) has ushered in an affordable way of leveraging the semantic cues for knowledge-driven causal discovery, but the development of LLMs for causal discovery lags behind other areas, particularly in the exploration of multi-modality data. To bridge the gap, we introduce MATMCD, a multi-agent system powered by tool-augmented LLMs. MATMCD has two key agents: a Data Augmentation agent that retrieves and processes modality-augmented data, and a Causal Constraint agent that integrates multi-modal data for knowledge-driven inference. Delicate design of the inner-workings ensures successful cooperation of the agents. Our empirical study across seven datasets suggests the significant potential of multi-modality enhanced causal discovery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13667v1-abstract-full').style.display = 'none'; document.getElementById('2412.13667v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13324">arXiv:2412.13324</a> <span> [<a href="https://arxiv.org/pdf/2412.13324">pdf</a>, <a href="https://arxiv.org/format/2412.13324">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> BadSAD: Clean-Label Backdoor Attacks against Deep Semi-Supervised Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+H">He Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Depeng Xu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Shuhan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13324v1-abstract-short" style="display: inline;"> Image anomaly detection (IAD) is essential in applications such as industrial inspection, medical imaging, and security. Despite the progress achieved with deep learning models like Deep Semi-Supervised Anomaly Detection (DeepSAD), these models remain susceptible to backdoor attacks, presenting significant security challenges. In this paper, we introduce BadSAD, a novel backdoor attack framework s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13324v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13324v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13324v1-abstract-full" style="display: none;"> Image anomaly detection (IAD) is essential in applications such as industrial inspection, medical imaging, and security. Despite the progress achieved with deep learning models like Deep Semi-Supervised Anomaly Detection (DeepSAD), these models remain susceptible to backdoor attacks, presenting significant security challenges. In this paper, we introduce BadSAD, a novel backdoor attack framework specifically designed to target DeepSAD models. Our approach involves two key phases: trigger injection, where subtle triggers are embedded into normal images, and latent space manipulation, which positions and clusters the poisoned images near normal images to make the triggers appear benign. Extensive experiments on benchmark datasets validate the effectiveness of our attack strategy, highlighting the severe risks that backdoor attacks pose to deep learning-based anomaly detection systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13324v1-abstract-full').style.display = 'none'; document.getElementById('2412.13324v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6.e; I.5.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13163">arXiv:2412.13163</a> <span> [<a href="https://arxiv.org/pdf/2412.13163">pdf</a>, <a href="https://arxiv.org/format/2412.13163">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> C-FedRAG: A Confidential Federated Retrieval-Augmented Generation System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Addison%2C+P">Parker Addison</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+M+H">Minh-Tuan H. Nguyen</a>, <a href="/search/cs?searchtype=author&query=Medan%2C+T">Tomislav Medan</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+J">Jinali Shah</a>, <a href="/search/cs?searchtype=author&query=Manzari%2C+M+T">Mohammad T. Manzari</a>, <a href="/search/cs?searchtype=author&query=McElrone%2C+B">Brendan McElrone</a>, <a href="/search/cs?searchtype=author&query=Lalwani%2C+L">Laksh Lalwani</a>, <a href="/search/cs?searchtype=author&query=More%2C+A">Aboli More</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+S">Smita Sharma</a>, <a href="/search/cs?searchtype=author&query=Roth%2C+H+R">Holger R. Roth</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+I">Isaac Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chester Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daguang Xu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yan Cheng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+A">Andrew Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziyue Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13163v2-abstract-short" style="display: inline;"> Organizations seeking to utilize Large Language Models (LLMs) for knowledge querying and analysis often encounter challenges in maintaining an LLM fine-tuned on targeted, up-to-date information that keeps answers relevant and grounded. Retrieval Augmented Generation (RAG) has quickly become a feasible solution for organizations looking to overcome the challenges of maintaining proprietary models a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13163v2-abstract-full').style.display = 'inline'; document.getElementById('2412.13163v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13163v2-abstract-full" style="display: none;"> Organizations seeking to utilize Large Language Models (LLMs) for knowledge querying and analysis often encounter challenges in maintaining an LLM fine-tuned on targeted, up-to-date information that keeps answers relevant and grounded. Retrieval Augmented Generation (RAG) has quickly become a feasible solution for organizations looking to overcome the challenges of maintaining proprietary models and to help reduce LLM hallucinations in their query responses. However, RAG comes with its own issues regarding scaling data pipelines across tiered-access and disparate data sources. In many scenarios, it is necessary to query beyond a single data silo to provide richer and more relevant context for an LLM. Analyzing data sources within and across organizational trust boundaries is often limited by complex data-sharing policies that prohibit centralized data storage, therefore, inhibit the fast and effective setup and scaling of RAG solutions. In this paper, we introduce Confidential Computing (CC) techniques as a solution for secure Federated Retrieval Augmented Generation (FedRAG). Our proposed Confidential FedRAG system (C-FedRAG) enables secure connection and scaling of a RAG workflows across a decentralized network of data providers by ensuring context confidentiality. We also demonstrate how to implement a C-FedRAG system using the NVIDIA FLARE SDK and assess its performance using the MedRAG toolkit and MIRAGE benchmarking dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13163v2-abstract-full').style.display = 'none'; document.getElementById('2412.13163v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12579">arXiv:2412.12579</a> <span> [<a href="https://arxiv.org/pdf/2412.12579">pdf</a>, <a href="https://arxiv.org/format/2412.12579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Operating Systems">cs.OS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Scaling Inter-procedural Dataflow Analysis on the Cloud </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zewen Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yujin Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Duanchen Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yun Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yueyang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yi Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaokang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yue Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuandong Li</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+Z">Zhiqiang Zuo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Qingda Lu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+W">Wenwen Peng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengjian Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12579v1-abstract-short" style="display: inline;"> Apart from forming the backbone of compiler optimization, static dataflow analysis has been widely applied in a vast variety of applications, such as bug detection, privacy analysis, program comprehension, etc. Despite its importance, performing interprocedural dataflow analysis on large-scale programs is well known to be challenging. In this paper, we propose a novel distributed analysis framewor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12579v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12579v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12579v1-abstract-full" style="display: none;"> Apart from forming the backbone of compiler optimization, static dataflow analysis has been widely applied in a vast variety of applications, such as bug detection, privacy analysis, program comprehension, etc. Despite its importance, performing interprocedural dataflow analysis on large-scale programs is well known to be challenging. In this paper, we propose a novel distributed analysis framework supporting the general interprocedural dataflow analysis. Inspired by large-scale graph processing, we devise dedicated distributed worklist algorithms for both whole-program analysis and incremental analysis. We implement these algorithms and develop a distributed framework called BigDataflow running on a large-scale cluster. The experimental results validate the promising performance of BigDataflow -- BigDataflow can finish analyzing the program of millions lines of code in minutes. Compared with the state-of-the-art, BigDataflow achieves much more analysis efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12579v1-abstract-full').style.display = 'none'; document.getElementById('2412.12579v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11102">arXiv:2412.11102</a> <span> [<a href="https://arxiv.org/pdf/2412.11102">pdf</a>, <a href="https://arxiv.org/format/2412.11102">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Empowering LLMs to Understand and Generate Complex Vector Graphics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Ximing Xing</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Juncheng Hu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+G">Guotao Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11102v2-abstract-short" style="display: inline;"> The unprecedented advancements in Large Language Models (LLMs) have profoundly impacted natural language processing but have yet to fully embrace the realm of scalable vector graphics (SVG) generation. While LLMs encode partial knowledge of SVG data from web pages during training, recent findings suggest that semantically ambiguous and tokenized representations within LLMs may result in hallucinat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11102v2-abstract-full').style.display = 'inline'; document.getElementById('2412.11102v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11102v2-abstract-full" style="display: none;"> The unprecedented advancements in Large Language Models (LLMs) have profoundly impacted natural language processing but have yet to fully embrace the realm of scalable vector graphics (SVG) generation. While LLMs encode partial knowledge of SVG data from web pages during training, recent findings suggest that semantically ambiguous and tokenized representations within LLMs may result in hallucinations in vector primitive predictions. Additionally, LLM training typically lacks modeling and understanding of the rendering sequence of vector paths, which can lead to occlusion between output vector primitives. In this paper, we present LLM4SVG, an initial yet substantial step toward bridging this gap by enabling LLMs to better understand and generate vector graphics. LLM4SVG facilitates a deeper understanding of SVG components through learnable semantic tokens, which precisely encode these tokens and their corresponding properties to generate semantically aligned SVG outputs. Using a series of learnable semantic tokens, a structured dataset for instruction following is developed to support comprehension and generation across two primary tasks. Our method introduces a modular architecture to existing large language models, integrating semantic tags, vector instruction encoders, fine-tuned commands, and powerful LLMs to tightly combine geometric, appearance, and language information. To overcome the scarcity of SVG-text instruction data, we developed an automated data generation pipeline that collected a massive dataset of more than 250k SVG data and 580k SVG-text instructions, which facilitated the adoption of the two-stage training strategy popular in LLM development. By exploring various training strategies, we developed LLM4SVG, which significantly moves beyond optimized rendering-based approaches and language-model-based baselines to achieve remarkable results in human evaluation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11102v2-abstract-full').style.display = 'none'; document.getElementById('2412.11102v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://ximinng.github.io/LLM4SVGProject/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10629">arXiv:2412.10629</a> <span> [<a href="https://arxiv.org/pdf/2412.10629">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rapid Reconstruction of Extremely Accelerated Liver 4D MRI via Chained Iterative Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+D">Di Xu</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+X">Xin Miao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hengjie Liu</a>, <a href="/search/cs?searchtype=author&query=Scholey%2C+J+E">Jessica E. Scholey</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wensha Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mary Feng</a>, <a href="/search/cs?searchtype=author&query=Ohliger%2C+M">Michael Ohliger</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&query=Lao%2C+Y">Yi Lao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+K">Ke Sheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10629v1-abstract-short" style="display: inline;"> Abstract Purpose: High-quality 4D MRI requires an impractically long scanning time for dense k-space signal acquisition covering all respiratory phases. Accelerated sparse sampling followed by reconstruction enhancement is desired but often results in degraded image quality and long reconstruction time. We hereby propose the chained iterative reconstruction network (CIRNet) for efficient sparse-sa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10629v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10629v1-abstract-full" style="display: none;"> Abstract Purpose: High-quality 4D MRI requires an impractically long scanning time for dense k-space signal acquisition covering all respiratory phases. Accelerated sparse sampling followed by reconstruction enhancement is desired but often results in degraded image quality and long reconstruction time. We hereby propose the chained iterative reconstruction network (CIRNet) for efficient sparse-sampling reconstruction while maintaining clinically deployable quality. Methods: CIRNet adopts the denoising diffusion probabilistic framework to condition the image reconstruction through a stochastic iterative denoising process. During training, a forward Markovian diffusion process is designed to gradually add Gaussian noise to the densely sampled ground truth (GT), while CIRNet is optimized to iteratively reverse the Markovian process from the forward outputs. At the inference stage, CIRNet performs the reverse process solely to recover signals from noise, conditioned upon the undersampled input. CIRNet processed the 4D data (3D+t) as temporal slices (2D+t). The proposed framework is evaluated on a data cohort consisting of 48 patients (12332 temporal slices) who underwent free-breathing liver 4D MRI. 3-, 6-, 10-, 20- and 30-times acceleration were examined with a retrospective random undersampling scheme. Compressed sensing (CS) reconstruction with a spatiotemporal constraint and a recently proposed deep network, Re-Con-GAN, are selected as baselines. Results: CIRNet consistently achieved superior performance compared to CS and Re-Con-GAN. The inference time of CIRNet, CS, and Re-Con-GAN are 11s, 120s, and 0.15s. Conclusion: A novel framework, CIRNet, is presented. CIRNet maintains useable image quality for acceleration up to 30 times, significantly reducing the burden of 4DMRI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10629v1-abstract-full').style.display = 'none'; document.getElementById('2412.10629v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10437">arXiv:2412.10437</a> <span> [<a href="https://arxiv.org/pdf/2412.10437">pdf</a>, <a href="https://arxiv.org/format/2412.10437">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SVGFusion: Scalable Text-to-SVG Generation via Vector Space Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Ximing Xing</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Juncheng Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10437v1-abstract-short" style="display: inline;"> The generation of Scalable Vector Graphics (SVG) assets from textual data remains a significant challenge, largely due to the scarcity of high-quality vector datasets and the limitations in scalable vector representations required for modeling intricate graphic distributions. This work introduces SVGFusion, a Text-to-SVG model capable of scaling to real-world SVG data without reliance on a text-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10437v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10437v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10437v1-abstract-full" style="display: none;"> The generation of Scalable Vector Graphics (SVG) assets from textual data remains a significant challenge, largely due to the scarcity of high-quality vector datasets and the limitations in scalable vector representations required for modeling intricate graphic distributions. This work introduces SVGFusion, a Text-to-SVG model capable of scaling to real-world SVG data without reliance on a text-based discrete language model or prolonged SDS optimization. The essence of SVGFusion is to learn a continuous latent space for vector graphics with a popular Text-to-Image framework. Specifically, SVGFusion consists of two modules: a Vector-Pixel Fusion Variational Autoencoder (VP-VAE) and a Vector Space Diffusion Transformer (VS-DiT). VP-VAE takes both the SVGs and corresponding rasterizations as inputs and learns a continuous latent space, whereas VS-DiT learns to generate a latent code within this space based on the text prompt. Based on VP-VAE, a novel rendering sequence modeling strategy is proposed to enable the latent space to embed the knowledge of construction logics in SVGs. This empowers the model to achieve human-like design capabilities in vector graphics, while systematically preventing occlusion in complex graphic compositions. Moreover, our SVGFusion's ability can be continuously improved by leveraging the scalability of the VS-DiT by adding more VS-DiT blocks. A large-scale SVG dataset is collected to evaluate the effectiveness of our proposed method. Extensive experimentation has confirmed the superiority of our SVGFusion over existing SVG generation methods, achieving enhanced quality and generalizability, thereby establishing a novel framework for SVG content creation. Code, model, and data will be released at: \href{https://ximinng.github.io/SVGFusionProject/}{https://ximinng.github.io/SVGFusionProject/} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10437v1-abstract-full').style.display = 'none'; document.getElementById('2412.10437v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: \href{https://ximinng.github.io/SVGFusionProject/}{https://ximinng.github.io/SVGFusionProject/}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08519">arXiv:2412.08519</a> <span> [<a href="https://arxiv.org/pdf/2412.08519">pdf</a>, <a href="https://arxiv.org/format/2412.08519">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Bridging Relevance and Reasoning: Rationale Distillation in Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+P">Pengyue Jia</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Derong Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaopeng Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhaocheng Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yichao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhao Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Huifeng Guo</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruiming Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08519v1-abstract-short" style="display: inline;"> The reranker and generator are two critical components in the Retrieval-Augmented Generation (i.e., RAG) pipeline, responsible for ranking relevant documents and generating responses. However, due to differences in pre-training data and objectives, there is an inevitable gap between the documents ranked as relevant by the reranker and those required by the generator to support answering the query.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08519v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08519v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08519v1-abstract-full" style="display: none;"> The reranker and generator are two critical components in the Retrieval-Augmented Generation (i.e., RAG) pipeline, responsible for ranking relevant documents and generating responses. However, due to differences in pre-training data and objectives, there is an inevitable gap between the documents ranked as relevant by the reranker and those required by the generator to support answering the query. To address this gap, we propose RADIO, a novel and practical preference alignment framework with RAtionale DIstillatiOn. Specifically, We first propose a rationale extraction method that leverages the reasoning capabilities of Large Language Models (LLMs) to extract the rationales necessary for answering the query. Subsequently, a rationale-based alignment process is designed to rerank the documents based on the extracted rationales, and fine-tune the reranker to align the preferences. We conduct extensive experiments on two tasks across three datasets to demonstrate the effectiveness of our approach compared to baseline methods. Our code is released online to ease reproduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08519v1-abstract-full').style.display = 'none'; document.getElementById('2412.08519v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07746">arXiv:2412.07746</a> <span> [<a href="https://arxiv.org/pdf/2412.07746">pdf</a>, <a href="https://arxiv.org/format/2412.07746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LoRA3D: Low-Rank Self-Calibration of 3D Geometric Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Ziqi Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Heng Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Danfei Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boyi Li</a>, <a href="/search/cs?searchtype=author&query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07746v1-abstract-short" style="display: inline;"> Emerging 3D geometric foundation models, such as DUSt3R, offer a promising approach for in-the-wild 3D vision tasks. However, due to the high-dimensional nature of the problem space and scarcity of high-quality 3D data, these pre-trained models still struggle to generalize to many challenging circumstances, such as limited view overlap or low lighting. To address this, we propose LoRA3D, an effici… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07746v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07746v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07746v1-abstract-full" style="display: none;"> Emerging 3D geometric foundation models, such as DUSt3R, offer a promising approach for in-the-wild 3D vision tasks. However, due to the high-dimensional nature of the problem space and scarcity of high-quality 3D data, these pre-trained models still struggle to generalize to many challenging circumstances, such as limited view overlap or low lighting. To address this, we propose LoRA3D, an efficient self-calibration pipeline to $\textit{specialize}$ the pre-trained models to target scenes using their own multi-view predictions. Taking sparse RGB images as input, we leverage robust optimization techniques to refine multi-view predictions and align them into a global coordinate frame. In particular, we incorporate prediction confidence into the geometric optimization process, automatically re-weighting the confidence to better reflect point estimation accuracy. We use the calibrated confidence to generate high-quality pseudo labels for the calibrating views and use low-rank adaptation (LoRA) to fine-tune the models on the pseudo-labeled data. Our method does not require any external priors or manual labels. It completes the self-calibration process on a $\textbf{single standard GPU within just 5 minutes}$. Each low-rank adapter requires only $\textbf{18MB}$ of storage. We evaluated our method on $\textbf{more than 160 scenes}$ from the Replica, TUM and Waymo Open datasets, achieving up to $\textbf{88% performance improvement}$ on 3D reconstruction, multi-view pose estimation and novel-view rendering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07746v1-abstract-full').style.display = 'none'; document.getElementById('2412.07746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07094">arXiv:2412.07094</a> <span> [<a href="https://arxiv.org/pdf/2412.07094">pdf</a>, <a href="https://arxiv.org/format/2412.07094">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Access Point Deployment for Localizing Accuracy and User Rate in Cell-Free Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fanfei Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shengheng Liu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Z">Zihuan Mao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shangqing Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dazhuan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dongming Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07094v1-abstract-short" style="display: inline;"> Evolving next-generation mobile networks is designed to provide ubiquitous coverage and networked sensing. With utility of multi-view sensing and multi-node joint transmission, cell-free is a promising technique to realize this prospect. This paper aims to tackle the problem of access point (AP) deployment in cell-free systems to balance the sensing accuracy and user rate. By merging the D-optimal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07094v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07094v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07094v1-abstract-full" style="display: none;"> Evolving next-generation mobile networks is designed to provide ubiquitous coverage and networked sensing. With utility of multi-view sensing and multi-node joint transmission, cell-free is a promising technique to realize this prospect. This paper aims to tackle the problem of access point (AP) deployment in cell-free systems to balance the sensing accuracy and user rate. By merging the D-optimality with Euclidean criterion, a novel integrated metric is proposed to be the objective function for both max-sum and max-min problems, which respectively guarantee the overall and lowest performance in multi-user communication and target tracking scenario. To solve the corresponding high dimensional non-convex multi-objective problem, the Soft actor-critic (SAC) is utilized to avoid risk of local optimal result. Numerical results demonstrate that proposed SAC-based APs deployment method achieves $20\%$ of overall performance and $120\%$ of lowest performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07094v1-abstract-full').style.display = 'none'; document.getElementById('2412.07094v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at MobiCom 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04468">arXiv:2412.04468</a> <span> [<a href="https://arxiv.org/pdf/2412.04468">pdf</a>, <a href="https://arxiv.org/format/2412.04468">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NVILA: Efficient Frontier Visual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhijian Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Ligeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Baifeng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuoyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+Y">Yuming Lou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shang Yang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haocheng Xi</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+S">Shiyi Cao</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuxian Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiuyu Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yunhao Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yukang Chen</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+C">Cheng-Yu Hsieh</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">De-An Huang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+A">An-Chieh Cheng</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+V">Vishwesh Nath</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinyi Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sifei Liu</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daguang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Molchanov%2C+P">Pavlo Molchanov</a>, <a href="/search/cs?searchtype=author&query=Kautz%2C+J">Jan Kautz</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongxu Yin</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04468v1-abstract-short" style="display: inline;"> Visual language models (VLMs) have made significant advances in accuracy in recent years. However, their efficiency has received much less attention. This paper introduces NVILA, a family of open VLMs designed to optimize both efficiency and accuracy. Building on top of VILA, we improve its model architecture by first scaling up the spatial and temporal resolutions, and then compressing visual tok… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04468v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04468v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04468v1-abstract-full" style="display: none;"> Visual language models (VLMs) have made significant advances in accuracy in recent years. However, their efficiency has received much less attention. This paper introduces NVILA, a family of open VLMs designed to optimize both efficiency and accuracy. Building on top of VILA, we improve its model architecture by first scaling up the spatial and temporal resolutions, and then compressing visual tokens. This "scale-then-compress" approach enables NVILA to efficiently process high-resolution images and long videos. We also conduct a systematic investigation to enhance the efficiency of NVILA throughout its entire lifecycle, from training and fine-tuning to deployment. NVILA matches or surpasses the accuracy of many leading open and proprietary VLMs across a wide range of image and video benchmarks. At the same time, it reduces training costs by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by 1.6-2.2X, and decoding latency by 1.2-2.8X. We will soon make our code and models available to facilitate reproducibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04468v1-abstract-full').style.display = 'none'; document.getElementById('2412.04468v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02906">arXiv:2412.02906</a> <span> [<a href="https://arxiv.org/pdf/2412.02906">pdf</a>, <a href="https://arxiv.org/format/2412.02906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Does Few-Shot Learning Help LLM Performance in Code Synthesis? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+D">Derek Xu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tong Xie</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+B">Botao Xia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoyu Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunsheng Bai</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yizhou Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02906v1-abstract-short" style="display: inline;"> Large language models (LLMs) have made significant strides at code generation through improved model design, training, and chain-of-thought. However, prompt-level optimizations remain an important yet under-explored aspect of LLMs for coding. This work focuses on the few-shot examples present in most code generation prompts, offering a systematic study on whether few-shot examples improve LLM's co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02906v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02906v1-abstract-full" style="display: none;"> Large language models (LLMs) have made significant strides at code generation through improved model design, training, and chain-of-thought. However, prompt-level optimizations remain an important yet under-explored aspect of LLMs for coding. This work focuses on the few-shot examples present in most code generation prompts, offering a systematic study on whether few-shot examples improve LLM's coding capabilities, which few-shot examples have the largest impact, and how to select impactful examples. Our work offers 2 approaches for selecting few-shot examples, a model-free method, CODEEXEMPLAR-FREE, and a model-based method, CODEEXEMPLAR-BASED. The 2 methods offer a trade-off between improved performance and reliance on training data and interpretability. Both methods significantly improve CodeLlama's coding ability across the popular HumanEval+ coding benchmark. In summary, our work provides valuable insights into how to pick few-shot examples in code generation prompts to improve LLM code generation capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02906v1-abstract-full').style.display = 'none'; document.getElementById('2412.02906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00719">arXiv:2412.00719</a> <span> [<a href="https://arxiv.org/pdf/2412.00719">pdf</a>, <a href="https://arxiv.org/format/2412.00719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Synergizing Motion and Appearance: Multi-Scale Compensatory Codebooks for Talking Head Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuling Zhao</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+F">Fa-Ting Hong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaoshui Huang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00719v1-abstract-short" style="display: inline;"> Talking head video generation aims to generate a realistic talking head video that preserves the person's identity from a source image and the motion from a driving video. Despite the promising progress made in the field, it remains a challenging and critical problem to generate videos with accurate poses and fine-grained facial details simultaneously. Essentially, facial motion is often highly co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00719v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00719v1-abstract-full" style="display: none;"> Talking head video generation aims to generate a realistic talking head video that preserves the person's identity from a source image and the motion from a driving video. Despite the promising progress made in the field, it remains a challenging and critical problem to generate videos with accurate poses and fine-grained facial details simultaneously. Essentially, facial motion is often highly complex to model precisely, and the one-shot source face image cannot provide sufficient appearance guidance during generation due to dynamic pose changes. To tackle the problem, we propose to jointly learn motion and appearance codebooks and perform multi-scale codebook compensation to effectively refine both the facial motion conditions and appearance features for talking face image decoding. Specifically, the designed multi-scale motion and appearance codebooks are learned simultaneously in a unified framework to store representative global facial motion flow and appearance patterns. Then, we present a novel multi-scale motion and appearance compensation module, which utilizes a transformer-based codebook retrieval strategy to query complementary information from the two codebooks for joint motion and appearance compensation. The entire process produces motion flows of greater flexibility and appearance features with fewer distortions across different scales, resulting in a high-quality talking head video generation framework. Extensive experiments on various benchmarks validate the effectiveness of our approach and demonstrate superior generation results from both qualitative and quantitative perspectives when compared to state-of-the-art competitors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00719v1-abstract-full').style.display = 'none'; document.getElementById('2412.00719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://shaelynz.github.io/synergize-motion-appearance/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19786">arXiv:2411.19786</a> <span> [<a href="https://arxiv.org/pdf/2411.19786">pdf</a>, <a href="https://arxiv.org/format/2411.19786">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MoTe: Learning Motion-Text Diffusion Model for Multiple Generation Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yiming Wu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Wei Ji</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kecheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19786v1-abstract-short" style="display: inline;"> Recently, human motion analysis has experienced great improvement due to inspiring generative models such as the denoising diffusion model and large language model. While the existing approaches mainly focus on generating motions with textual descriptions and overlook the reciprocal task. In this paper, we present~\textbf{MoTe}, a unified multi-modal model that could handle diverse tasks by learni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19786v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19786v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19786v1-abstract-full" style="display: none;"> Recently, human motion analysis has experienced great improvement due to inspiring generative models such as the denoising diffusion model and large language model. While the existing approaches mainly focus on generating motions with textual descriptions and overlook the reciprocal task. In this paper, we present~\textbf{MoTe}, a unified multi-modal model that could handle diverse tasks by learning the marginal, conditional, and joint distributions of motion and text simultaneously. MoTe enables us to handle the paired text-motion generation, motion captioning, and text-driven motion generation by simply modifying the input context. Specifically, MoTe is composed of three components: Motion Encoder-Decoder (MED), Text Encoder-Decoder (TED), and Moti-on-Text Diffusion Model (MTDM). In particular, MED and TED are trained for extracting latent embeddings, and subsequently reconstructing the motion sequences and textual descriptions from the extracted embeddings, respectively. MTDM, on the other hand, performs an iterative denoising process on the input context to handle diverse tasks. Experimental results on the benchmark datasets demonstrate the superior performance of our proposed method on text-to-motion generation and competitive performance on motion captioning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19786v1-abstract-full').style.display = 'none'; document.getElementById('2411.19786v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Five figures, six tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18823">arXiv:2411.18823</a> <span> [<a href="https://arxiv.org/pdf/2411.18823">pdf</a>, <a href="https://arxiv.org/format/2411.18823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Task Label Discovery via Hierarchical Task Tokens for Partially Annotated Dense Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingdong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hanrong Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18823v1-abstract-short" style="display: inline;"> In recent years, simultaneous learning of multiple dense prediction tasks with partially annotated label data has emerged as an important research area. Previous works primarily focus on constructing cross-task consistency or conducting adversarial training to regularize cross-task predictions, which achieve promising performance improvements, while still suffering from the lack of direct pixel-wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18823v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18823v1-abstract-full" style="display: none;"> In recent years, simultaneous learning of multiple dense prediction tasks with partially annotated label data has emerged as an important research area. Previous works primarily focus on constructing cross-task consistency or conducting adversarial training to regularize cross-task predictions, which achieve promising performance improvements, while still suffering from the lack of direct pixel-wise supervision for multi-task dense predictions. To tackle this challenge, we propose a novel approach to optimize a set of learnable hierarchical task tokens, including global and fine-grained ones, to discover consistent pixel-wise supervision signals in both feature and prediction levels. Specifically, the global task tokens are designed for effective cross-task feature interactions in a global context. Then, a group of fine-grained task-specific spatial tokens for each task is learned from the corresponding global task tokens. It is embedded to have dense interactions with each task-specific feature map. The learned global and local fine-grained task tokens are further used to discover pseudo task-specific dense labels at different levels of granularity, and they can be utilized to directly supervise the learning of the multi-task dense prediction framework. Extensive experimental results on challenging NYUD-v2, Cityscapes, and PASCAL Context datasets demonstrate significant improvements over existing state-of-the-art methods for partially annotated multi-task dense prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18823v1-abstract-full').style.display = 'none'; document.getElementById('2411.18823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18473">arXiv:2411.18473</a> <span> [<a href="https://arxiv.org/pdf/2411.18473">pdf</a>, <a href="https://arxiv.org/format/2411.18473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HEMGS: A Hybrid Entropy Model for 3D Gaussian Splatting Data Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18473v1-abstract-short" style="display: inline;"> Fast progress in 3D Gaussian Splatting (3DGS) has made 3D Gaussians popular for 3D modeling and image rendering, but this creates big challenges in data storage and transmission. To obtain a highly compact 3DGS representation, we propose a hybrid entropy model for Gaussian Splatting (HEMGS) data compression, which comprises two primary components, a hyperprior network and an autoregressive network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18473v1-abstract-full" style="display: none;"> Fast progress in 3D Gaussian Splatting (3DGS) has made 3D Gaussians popular for 3D modeling and image rendering, but this creates big challenges in data storage and transmission. To obtain a highly compact 3DGS representation, we propose a hybrid entropy model for Gaussian Splatting (HEMGS) data compression, which comprises two primary components, a hyperprior network and an autoregressive network. To effectively reduce structural redundancy across attributes, we apply a progressive coding algorithm to generate hyperprior features, in which we use previously compressed attributes and location as prior information. In particular, to better extract the location features from these compressed attributes, we adopt a domain-aware and instance-aware architecture to respectively capture domain-aware structural relations without additional storage costs and reveal scene-specific features through MLPs. Additionally, to reduce redundancy within each attribute, we leverage relationships between neighboring compressed elements within the attributes through an autoregressive network. Given its unique structure, we propose an adaptive context coding algorithm with flexible receptive fields to effectively capture adjacent compressed elements. Overall, we integrate our HEMGS into an end-to-end optimized 3DGS compression framework and the extensive experimental results on four benchmarks indicate that our method achieves about 40\% average reduction in size while maintaining the rendering quality over our baseline method and achieving state-of-the-art compression results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18473v1-abstract-full').style.display = 'none'; document.getElementById('2411.18473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18375">arXiv:2411.18375</a> <span> [<a href="https://arxiv.org/pdf/2411.18375">pdf</a>, <a href="https://arxiv.org/format/2411.18375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Individual Content and Motion Dynamics Preserved Pruning for Video Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yiming Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huan Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenghao Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18375v1-abstract-short" style="display: inline;"> The high computational cost and slow inference time are major obstacles to deploying the video diffusion model (VDM) in practical applications. To overcome this, we introduce a new Video Diffusion Model Compression approach using individual content and motion dynamics preserved pruning and consistency loss. First, we empirically observe that deeper VDM layers are crucial for maintaining the qualit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18375v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18375v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18375v1-abstract-full" style="display: none;"> The high computational cost and slow inference time are major obstacles to deploying the video diffusion model (VDM) in practical applications. To overcome this, we introduce a new Video Diffusion Model Compression approach using individual content and motion dynamics preserved pruning and consistency loss. First, we empirically observe that deeper VDM layers are crucial for maintaining the quality of \textbf{motion dynamics} e.g., coherence of the entire video, while shallower layers are more focused on \textbf{individual content} e.g., individual frames. Therefore, we prune redundant blocks from the shallower layers while preserving more of the deeper layers, resulting in a lightweight VDM variant called VDMini. Additionally, we propose an \textbf{Individual Content and Motion Dynamics (ICMD)} Consistency Loss to gain comparable generation performance as larger VDM, i.e., the teacher to VDMini i.e., the student. Particularly, we first use the Individual Content Distillation (ICD) Loss to ensure consistency in the features of each generated frame between the teacher and student models. Next, we introduce a Multi-frame Content Adversarial (MCA) Loss to enhance the motion dynamics across the generated video as a whole. This method significantly accelerates inference time while maintaining high-quality video generation. Extensive experiments demonstrate the effectiveness of our VDMini on two important video generation tasks, Text-to-Video (T2V) and Image-to-Video (I2V), where we respectively achieve an average 2.5 $\times$ and 1.4 $\times$ speed up for the I2V method SF-V and the T2V method T2V-Turbo-v2, while maintaining the quality of the generated videos on two benchmarks, i.e., UCF101 and VBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18375v1-abstract-full').style.display = 'none'; document.getElementById('2411.18375v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17832">arXiv:2411.17832</a> <span> [<a href="https://arxiv.org/pdf/2411.17832">pdf</a>, <a href="https://arxiv.org/format/2411.17832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SVGDreamer++: Advancing Editability and Diversity in Text-Guided SVG Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Ximing Xing</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qian Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chuang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Haitao Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17832v2-abstract-short" style="display: inline;"> Recently, text-guided scalable vector graphics (SVG) synthesis has demonstrated significant potential in domains such as iconography and sketching. However, SVGs generated from existing Text-to-SVG methods often lack editability and exhibit deficiencies in visual quality and diversity. In this paper, we propose a novel text-guided vector graphics synthesis method to address these limitations. To e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17832v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17832v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17832v2-abstract-full" style="display: none;"> Recently, text-guided scalable vector graphics (SVG) synthesis has demonstrated significant potential in domains such as iconography and sketching. However, SVGs generated from existing Text-to-SVG methods often lack editability and exhibit deficiencies in visual quality and diversity. In this paper, we propose a novel text-guided vector graphics synthesis method to address these limitations. To enhance the editability of output SVGs, we introduce a Hierarchical Image VEctorization (HIVE) framework that operates at the semantic object level and supervises the optimization of components within the vector object. This approach facilitates the decoupling of vector graphics into distinct objects and component levels. Our proposed HIVE algorithm, informed by image segmentation priors, not only ensures a more precise representation of vector graphics but also enables fine-grained editing capabilities within vector objects. To improve the diversity of output SVGs, we present a Vectorized Particle-based Score Distillation (VPSD) approach. VPSD addresses over-saturation issues in existing methods and enhances sample diversity. A pre-trained reward model is incorporated to re-weight vector particles, improving aesthetic appeal and enabling faster convergence. Additionally, we design a novel adaptive vector primitives control strategy, which allows for the dynamic adjustment of the number of primitives, thereby enhancing the presentation of graphic details. Extensive experiments validate the effectiveness of the proposed method, demonstrating its superiority over baseline methods in terms of editability, visual quality, and diversity. We also show that our new method supports up to six distinct vector styles, capable of generating high-quality vector assets suitable for stylized vector design and poster design. Code and demo will be released at: http://ximinng.github.io/SVGDreamerV2Project/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17832v2-abstract-full').style.display = 'none'; document.getElementById('2411.17832v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 17 figures. Project Page: http://ximinng.github.io/SVGDreamerV2Project/. arXiv admin note: text overlap with arXiv:2312.16476</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17182">arXiv:2411.17182</a> <span> [<a href="https://arxiv.org/pdf/2411.17182">pdf</a>, <a href="https://arxiv.org/format/2411.17182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An In-depth Investigation of Sparse Rate Reduction in Transformer-like Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yunzhe Hu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+D">Difan Zou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17182v1-abstract-short" style="display: inline;"> Deep neural networks have long been criticized for being black-box. To unveil the inner workings of modern neural architectures, a recent work \cite{yu2024white} proposed an information-theoretic objective function called Sparse Rate Reduction (SRR) and interpreted its unrolled optimization as a Transformer-like model called Coding Rate Reduction Transformer (CRATE). However, the focus of the stud… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17182v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17182v1-abstract-full" style="display: none;"> Deep neural networks have long been criticized for being black-box. To unveil the inner workings of modern neural architectures, a recent work \cite{yu2024white} proposed an information-theoretic objective function called Sparse Rate Reduction (SRR) and interpreted its unrolled optimization as a Transformer-like model called Coding Rate Reduction Transformer (CRATE). However, the focus of the study was primarily on the basic implementation, and whether this objective is optimized in practice and its causal relationship to generalization remain elusive. Going beyond this study, we derive different implementations by analyzing layer-wise behaviors of CRATE, both theoretically and empirically. To reveal the predictive power of SRR on generalization, we collect a set of model variants induced by varied implementations and hyperparameters and evaluate SRR as a complexity measure based on its correlation with generalization. Surprisingly, we find out that SRR has a positive correlation coefficient and outperforms other baseline measures, such as path-norm and sharpness-based ones. Furthermore, we show that generalization can be improved using SRR as regularization on benchmark image classification datasets. We hope this paper can shed light on leveraging SRR to design principled models and study their generalization ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17182v1-abstract-full').style.display = 'none'; document.getElementById('2411.17182v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16740">arXiv:2411.16740</a> <span> [<a href="https://arxiv.org/pdf/2411.16740">pdf</a>, <a href="https://arxiv.org/format/2411.16740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Document Haystacks: Vision-Language Reasoning Over Piles of 1000+ Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dannong Xu</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+J">Junjie Fei</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chun-Mei Feng</a>, <a href="/search/cs?searchtype=author&query=Elhoseiny%2C+M">Mohamed Elhoseiny</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16740v3-abstract-short" style="display: inline;"> Large multimodal models (LMMs) have achieved impressive progress in vision-language understanding, yet they face limitations in real-world applications requiring complex reasoning over a large number of images. Existing benchmarks for multi-image question-answering are limited in scope, each question is paired with only up to 30 images, which does not fully capture the demands of large-scale retri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16740v3-abstract-full').style.display = 'inline'; document.getElementById('2411.16740v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16740v3-abstract-full" style="display: none;"> Large multimodal models (LMMs) have achieved impressive progress in vision-language understanding, yet they face limitations in real-world applications requiring complex reasoning over a large number of images. Existing benchmarks for multi-image question-answering are limited in scope, each question is paired with only up to 30 images, which does not fully capture the demands of large-scale retrieval tasks encountered in the real-world usages. To reduce these gaps, we introduce two document haystack benchmarks, dubbed DocHaystack and InfoHaystack, designed to evaluate LMM performance on large-scale visual document retrieval and understanding. Additionally, we propose V-RAG, a novel, vision-centric retrieval-augmented generation (RAG) framework that leverages a suite of multimodal vision encoders, each optimized for specific strengths, and a dedicated question-document relevance module. V-RAG sets a new standard, with a 9% and 11% improvement in Recall@1 on the challenging DocHaystack-1000 and InfoHaystack-1000 benchmarks, respectively, compared to the previous best baseline models. Additionally, integrating V-RAG with LMMs enables them to efficiently operate across thousands of images, yielding significant improvements on our DocHaystack and InfoHaystack benchmarks. Our code and datasets are available at https://github.com/Vision-CAIR/dochaystacks <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16740v3-abstract-full').style.display = 'none'; document.getElementById('2411.16740v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">the correct arxiv version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16686">arXiv:2411.16686</a> <span> [<a href="https://arxiv.org/pdf/2411.16686">pdf</a>, <a href="https://arxiv.org/format/2411.16686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ProteinWeaver: A Divide-and-Assembly Approach for Protein Backbone Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yiming Ma</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+F">Fei Ye</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zaixiang Zheng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Dongyu Xue</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Q">Quanquan Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16686v2-abstract-short" style="display: inline;"> Nature creates diverse proteins through a 'divide and assembly' strategy. Inspired by this idea, we introduce ProteinWeaver, a two-stage framework for protein backbone design. Our method first generates individual protein domains and then employs an SE(3) diffusion model to flexibly assemble these domains. A key challenge lies in the assembling step, given the complex and rugged nature of the inte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16686v2-abstract-full').style.display = 'inline'; document.getElementById('2411.16686v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16686v2-abstract-full" style="display: none;"> Nature creates diverse proteins through a 'divide and assembly' strategy. Inspired by this idea, we introduce ProteinWeaver, a two-stage framework for protein backbone design. Our method first generates individual protein domains and then employs an SE(3) diffusion model to flexibly assemble these domains. A key challenge lies in the assembling step, given the complex and rugged nature of the inter-domain interaction landscape. To address this challenge, we employ preference alignment to discern complex relationships between structure and interaction landscapes through comparative analysis of generated samples. Comprehensive experiments demonstrate that ProteinWeaver: (1) generates high-quality, novel protein backbones through versatile domain assembly; (2) outperforms RFdiffusion, the current state-of-the-art in backbone design, by 13\% and 39\% for long-chain proteins; (3) shows the potential for cooperative function design through illustrative case studies. To sum up, by introducing a `divide-and-assembly' paradigm, ProteinWeaver advances protein engineering and opens new avenues for functional protein design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16686v2-abstract-full').style.display = 'none'; document.getElementById('2411.16686v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 10 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16239">arXiv:2411.16239</a> <span> [<a href="https://arxiv.org/pdf/2411.16239">pdf</a>, <a href="https://arxiv.org/format/2411.16239">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> CS-Eval: A Comprehensive Large Language Model Benchmark for CyberSecurity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhengmin Yu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jiutian Zeng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyi Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenhan Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dandan Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangyu Liu</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+Z">Zonghao Ying</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Nan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16239v3-abstract-short" style="display: inline;"> Over the past year, there has been a notable rise in the use of large language models (LLMs) for academic research and industrial practices within the cybersecurity field. However, it remains a lack of comprehensive and publicly accessible benchmarks to evaluate the performance of LLMs on cybersecurity tasks. To address this gap, we introduce CS-Eval, a publicly accessible, comprehensive and bilin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16239v3-abstract-full').style.display = 'inline'; document.getElementById('2411.16239v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16239v3-abstract-full" style="display: none;"> Over the past year, there has been a notable rise in the use of large language models (LLMs) for academic research and industrial practices within the cybersecurity field. However, it remains a lack of comprehensive and publicly accessible benchmarks to evaluate the performance of LLMs on cybersecurity tasks. To address this gap, we introduce CS-Eval, a publicly accessible, comprehensive and bilingual LLM benchmark specifically designed for cybersecurity. CS-Eval synthesizes the research hotspots from academia and practical applications from industry, curating a diverse set of high-quality questions across 42 categories within cybersecurity, systematically organized into three cognitive levels: knowledge, ability, and application. Through an extensive evaluation of a wide range of LLMs using CS-Eval, we have uncovered valuable insights. For instance, while GPT-4 generally excels overall, other models may outperform it in certain specific subcategories. Additionally, by conducting evaluations over several months, we observed significant improvements in many LLMs' abilities to solve cybersecurity tasks. The benchmarks are now publicly available at https://github.com/CS-EVAL/CS-Eval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16239v3-abstract-full').style.display = 'none'; document.getElementById('2411.16239v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15558">arXiv:2411.15558</a> <span> [<a href="https://arxiv.org/pdf/2411.15558">pdf</a>, <a href="https://arxiv.org/format/2411.15558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reassessing Layer Pruning in LLMs: New Insights and Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yao Lu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yujie Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiaheng Wei</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dongwei Xu</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+Q">Qi Xuan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoniu Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhaowei Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15558v1-abstract-short" style="display: inline;"> Although large language models (LLMs) have achieved remarkable success across various domains, their considerable scale necessitates substantial computational resources, posing significant challenges for deployment in resource-constrained environments. Layer pruning, as a simple yet effective compression method, removes layers of a model directly, reducing computational overhead. However, what are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15558v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15558v1-abstract-full" style="display: none;"> Although large language models (LLMs) have achieved remarkable success across various domains, their considerable scale necessitates substantial computational resources, posing significant challenges for deployment in resource-constrained environments. Layer pruning, as a simple yet effective compression method, removes layers of a model directly, reducing computational overhead. However, what are the best practices for layer pruning in LLMs? Are sophisticated layer selection metrics truly effective? Does the LoRA (Low-Rank Approximation) family, widely regarded as a leading method for pruned model fine-tuning, truly meet expectations when applied to post-pruning fine-tuning? To answer these questions, we dedicate thousands of GPU hours to benchmarking layer pruning in LLMs and gaining insights across multiple dimensions. Our results demonstrate that a simple approach, i.e., pruning the final 25\% of layers followed by fine-tuning the \texttt{lm\_head} and the remaining last three layer, yields remarkably strong performance. Following this guide, we prune Llama-3.1-8B-It and obtain a model that outperforms many popular LLMs of similar size, such as ChatGLM2-6B, Vicuna-7B-v1.5, Qwen1.5-7B and Baichuan2-7B. We release the optimal model weights on Huggingface, and the code is available on GitHub. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15558v1-abstract-full').style.display = 'none'; document.getElementById('2411.15558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15276">arXiv:2411.15276</a> <span> [<a href="https://arxiv.org/pdf/2411.15276">pdf</a>, <a href="https://arxiv.org/format/2411.15276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Event USKT : U-State Space Model in Knowledge Transfer for Event Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuhui Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiahao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jimin Xiao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Ding Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenjun Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiaxuan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15276v1-abstract-short" style="display: inline;"> Event cameras, as an emerging imaging technology, offer distinct advantages over traditional RGB cameras, including reduced energy consumption and higher frame rates. However, the limited quantity of available event data presents a significant challenge, hindering their broader development. To alleviate this issue, we introduce a tailored U-shaped State Space Model Knowledge Transfer (USKT) framew… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15276v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15276v1-abstract-full" style="display: none;"> Event cameras, as an emerging imaging technology, offer distinct advantages over traditional RGB cameras, including reduced energy consumption and higher frame rates. However, the limited quantity of available event data presents a significant challenge, hindering their broader development. To alleviate this issue, we introduce a tailored U-shaped State Space Model Knowledge Transfer (USKT) framework for Event-to-RGB knowledge transfer. This framework generates inputs compatible with RGB frames, enabling event data to effectively reuse pre-trained RGB models and achieve competitive performance with minimal parameter tuning. Within the USKT architecture, we also propose a bidirectional reverse state space model. Unlike conventional bidirectional scanning mechanisms, the proposed Bidirectional Reverse State Space Model (BiR-SSM) leverages a shared weight strategy, which facilitates efficient modeling while conserving computational resources. In terms of effectiveness, integrating USKT with ResNet50 as the backbone improves model performance by 0.95%, 3.57%, and 2.9% on DVS128 Gesture, N-Caltech101, and CIFAR-10-DVS datasets, respectively, underscoring USKT's adaptability and effectiveness. The code will be made available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15276v1-abstract-full').style.display = 'none'; document.getElementById('2411.15276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14514">arXiv:2411.14514</a> <span> [<a href="https://arxiv.org/pdf/2411.14514">pdf</a>, <a href="https://arxiv.org/format/2411.14514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NexusSplats: Efficient 3D Gaussian Splatting in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yuzhou Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dejun Xu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yongjie Hou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenzhong Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Min Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14514v4-abstract-short" style="display: inline;"> While 3D Gaussian Splatting (3DGS) has recently demonstrated remarkable rendering quality and efficiency in 3D scene reconstruction, it struggles with varying lighting conditions and incidental occlusions in real-world scenarios. To accommodate varying lighting conditions, existing 3DGS extensions apply color mapping to the massive Gaussian primitives with individually optimized appearance embeddi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14514v4-abstract-full').style.display = 'inline'; document.getElementById('2411.14514v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14514v4-abstract-full" style="display: none;"> While 3D Gaussian Splatting (3DGS) has recently demonstrated remarkable rendering quality and efficiency in 3D scene reconstruction, it struggles with varying lighting conditions and incidental occlusions in real-world scenarios. To accommodate varying lighting conditions, existing 3DGS extensions apply color mapping to the massive Gaussian primitives with individually optimized appearance embeddings. To handle occlusions, they predict pixel-wise uncertainties via 2D image features for occlusion capture. Nevertheless, such massive color mapping and pixel-wise uncertainty prediction strategies suffer from not only additional computational costs but also coarse-grained lighting and occlusion handling. In this work, we propose a nexus kernel-driven approach, termed NexusSplats, for efficient and finer 3D scene reconstruction under complex lighting and occlusion conditions. In particular, NexusSplats leverages a novel light decoupling strategy where appearance embeddings are optimized based on nexus kernels instead of massive Gaussian primitives, thus accelerating reconstruction speeds while ensuring local color consistency for finer textures. Additionally, a Gaussian-wise uncertainty mechanism is developed, aligning 3D structures with 2D image features for fine-grained occlusion handling. Experimental results demonstrate that NexusSplats achieves state-of-the-art rendering quality while reducing reconstruction time by up to 70.4% compared to the current best in quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14514v4-abstract-full').style.display = 'none'; document.getElementById('2411.14514v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://nexus-splats.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14158">arXiv:2411.14158</a> <span> [<a href="https://arxiv.org/pdf/2411.14158">pdf</a>, <a href="https://arxiv.org/format/2411.14158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Point Cloud Denoising With Fine-Granularity Dynamic Graph Convolutional Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Wenrui Dai</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Duoduo Xue</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Ziyang Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenglin Li</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+J">Junni Zou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hongkai Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14158v1-abstract-short" style="display: inline;"> Due to limitations in acquisition equipment, noise perturbations often corrupt 3-D point clouds, hindering down-stream tasks such as surface reconstruction, rendering, and further processing. Existing 3-D point cloud denoising methods typically fail to reliably fit the underlying continuous surface, resulting in a degradation of reconstruction performance. This paper introduces fine-granularity dy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14158v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14158v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14158v1-abstract-full" style="display: none;"> Due to limitations in acquisition equipment, noise perturbations often corrupt 3-D point clouds, hindering down-stream tasks such as surface reconstruction, rendering, and further processing. Existing 3-D point cloud denoising methods typically fail to reliably fit the underlying continuous surface, resulting in a degradation of reconstruction performance. This paper introduces fine-granularity dynamic graph convolutional networks called GD-GCN, a novel approach to denoising in 3-D point clouds. The GD-GCN employs micro-step temporal graph convolution (MST-GConv) to perform feature learning in a gradual manner. Compared with the conventional GCN, which commonly uses discrete integer-step graph convolution, this modification introduces a more adaptable and nuanced approach to feature learning within graph convolution networks. It more accurately depicts the process of fitting the point cloud with noise to the underlying surface by and the learning process for MST-GConv acts like a changing system and is managed through a type of neural network known as neural Partial Differential Equations (PDEs). This means it can adapt and improve over time. GD-GCN approximates the Riemannian metric, calculating distances between points along a low-dimensional manifold. This capability allows it to understand the local geometric structure and effectively capture diverse relationships between points from different geometric regions through geometric graph construction based on Riemannian distances. Additionally, GD-GCN incorporates robust graph spectral filters based on the Bernstein polynomial approximation, which modulate eigenvalues for complex and arbitrary spectral responses, providing theoretical guarantees for BIBO stability. Symmetric channel mixing matrices further enhance filter flexibility by enabling channel-level scaling and shifting in the spectral domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14158v1-abstract-full').style.display = 'none'; document.getElementById('2411.14158v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14120">arXiv:2411.14120</a> <span> [<a href="https://arxiv.org/pdf/2411.14120">pdf</a>, <a href="https://arxiv.org/format/2411.14120">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Point Cloud Resampling with Learnable Heat Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Wenrui Dai</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+D">Duoduo Xue</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Ziyang Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenglin Li</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+J">Junni Zou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hongkai Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14120v1-abstract-short" style="display: inline;"> Generative diffusion models have shown empirical successes in point cloud resampling, generating a denser and more uniform distribution of points from sparse or noisy 3D point clouds by progressively refining noise into structure. However, existing diffusion models employ manually predefined schemes, which often fail to recover the underlying point cloud structure due to the rigid and disruptive n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14120v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14120v1-abstract-full" style="display: none;"> Generative diffusion models have shown empirical successes in point cloud resampling, generating a denser and more uniform distribution of points from sparse or noisy 3D point clouds by progressively refining noise into structure. However, existing diffusion models employ manually predefined schemes, which often fail to recover the underlying point cloud structure due to the rigid and disruptive nature of the geometric degradation. To address this issue, we propose a novel learnable heat diffusion framework for point cloud resampling, which directly parameterizes the marginal distribution for the forward process by learning the adaptive heat diffusion schedules and local filtering scales of the time-varying heat kernel, and consequently, generates an adaptive conditional prior for the reverse process. Unlike previous diffusion models with a fixed prior, the adaptive conditional prior selectively preserves geometric features of the point cloud by minimizing a refined variational lower bound, guiding the points to evolve towards the underlying surface during the reverse process. Extensive experimental results demonstrate that the proposed point cloud resampling achieves state-of-the-art performance in representative reconstruction tasks including point cloud denoising and upsampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14120v1-abstract-full').style.display = 'none'; document.getElementById('2411.14120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12915">arXiv:2411.12915</a> <span> [<a href="https://arxiv.org/pdf/2411.12915">pdf</a>, <a href="https://arxiv.org/format/2411.12915">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VILA-M3: Enhancing Vision-Language Models with Medical Expert Knowledge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nath%2C+V">Vishwesh Nath</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenqi Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Myronenko%2C+A">Andriy Myronenko</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+M">Mingxin Zheng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yao Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhijian Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongxu Yin</a>, <a href="/search/cs?searchtype=author&query=Law%2C+Y+M">Yee Man Law</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pengfei Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Can Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziyue Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yufan He</a>, <a href="/search/cs?searchtype=author&query=Heinrich%2C+G">Greg Heinrich</a>, <a href="/search/cs?searchtype=author&query=Aylward%2C+S">Stephen Aylward</a>, <a href="/search/cs?searchtype=author&query=Edgar%2C+M">Marc Edgar</a>, <a href="/search/cs?searchtype=author&query=Zephyr%2C+M">Michael Zephyr</a>, <a href="/search/cs?searchtype=author&query=Molchanov%2C+P">Pavlo Molchanov</a>, <a href="/search/cs?searchtype=author&query=Turkbey%2C+B">Baris Turkbey</a>, <a href="/search/cs?searchtype=author&query=Roth%2C+H">Holger Roth</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daguang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12915v1-abstract-short" style="display: inline;"> Generalist vision language models (VLMs) have made significant strides in computer vision, but they fall short in specialized fields like healthcare, where expert knowledge is essential. In traditional computer vision tasks, creative or approximate answers may be acceptable, but in healthcare, precision is paramount.Current large multimodal models like Gemini and GPT-4o are insufficient for medica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12915v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12915v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12915v1-abstract-full" style="display: none;"> Generalist vision language models (VLMs) have made significant strides in computer vision, but they fall short in specialized fields like healthcare, where expert knowledge is essential. In traditional computer vision tasks, creative or approximate answers may be acceptable, but in healthcare, precision is paramount.Current large multimodal models like Gemini and GPT-4o are insufficient for medical tasks due to their reliance on memorized internet knowledge rather than the nuanced expertise required in healthcare. VLMs are usually trained in three stages: vision pre-training, vision-language pre-training, and instruction fine-tuning (IFT). IFT has been typically applied using a mixture of generic and healthcare data. In contrast, we propose that for medical VLMs, a fourth stage of specialized IFT is necessary, which focuses on medical data and includes information from domain expert models. Domain expert models developed for medical use are crucial because they are specifically trained for certain clinical tasks, e.g. to detect tumors and classify abnormalities through segmentation and classification, which learn fine-grained features of medical data$-$features that are often too intricate for a VLM to capture effectively especially in radiology. This paper introduces a new framework, VILA-M3, for medical VLMs that utilizes domain knowledge via expert models. Through our experiments, we show an improved state-of-the-art (SOTA) performance with an average improvement of ~9% over the prior SOTA model Med-Gemini and ~6% over models trained on the specific tasks. Our approach emphasizes the importance of domain expertise in creating precise, reliable VLMs for medical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12915v1-abstract-full').style.display = 'none'; document.getElementById('2411.12915v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xu%2C+D&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xu%2C+D&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>