Search | arXiv e-print repository
Showing 1–50 of 522 results for author: Huang, D

Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Huang%2C+D
1. arXiv:2502.07238 [pdf, other] (cs.CV, cs.AI)
Diffusion Suction Grasping with Large-Scale Parcel Dataset
Authors: Ding-Tao Huang, Xinyi He, Debei Hua, Dongfang Yu, En-Te Lin, Long Zeng
Abstract: While recent advances in object suction grasping have shown remarkable progress, significant challenges persist, particularly in cluttered and complex parcel-handling scenarios. Two fundamental limitations hinder current approaches: (1) the lack of a comprehensive suction grasp dataset tailored to parcel manipulation tasks, and (2) insufficient adaptability to diverse object characteristics, including size variation, geometric complexity, and textural diversity. To address these challenges, we present Parcel-Suction-Dataset, a large-scale synthetic dataset containing 25 thousand cluttered scenes with 410 million precision-annotated suction grasp poses. The dataset is generated through our novel geometric sampling algorithm, which efficiently generates optimal suction grasps while incorporating both physical constraints and material properties. We further propose Diffusion-Suction, a framework that reformulates suction grasp prediction as a conditional generation task using denoising diffusion probabilistic models. Our method iteratively refines random noise into suction grasp score maps under visually conditioned guidance from point cloud observations, effectively learning spatial point-wise affordances from our synthetic dataset. Extensive experiments demonstrate that the simple yet efficient Diffusion-Suction achieves new state-of-the-art performance over previous models on both Parcel-Suction-Dataset and the public SuctionNet-1Billion benchmark.
Submitted 10 February, 2025; originally announced February 2025.
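A minimal sketch of the conditional-generation idea described in the entry above: per-point suction scores are recovered by iteratively denoising Gaussian noise with a DDPM ancestral sampler. The noise predictor eps_model, the linear beta schedule, and all shapes are hypothetical stand-ins, not the paper's implementation.

    import torch

    def ddpm_denoise_scores(eps_model, cond, n_points, T=50):
        """Refine random noise into per-point suction grasp scores.

        eps_model(x_t, t, cond) predicts the noise at step t; `cond`
        stands in for point-cloud features guiding the denoising.
        """
        betas = torch.linspace(1e-4, 0.02, T)
        alphas = 1.0 - betas
        alpha_bar = torch.cumprod(alphas, dim=0)

        x = torch.randn(n_points)                      # x_T ~ N(0, I)
        for t in reversed(range(T)):
            eps = eps_model(x, t, cond)                # predicted noise
            mean = (x - betas[t] / torch.sqrt(1.0 - alpha_bar[t]) * eps) / torch.sqrt(alphas[t])
            noise = torch.randn_like(x) if t > 0 else torch.zeros_like(x)
            x = mean + torch.sqrt(betas[t]) * noise
        return x                                       # per-point scores

    # Toy run with a dummy predictor, just to show the call shape:
    scores = ddpm_denoise_scores(lambda x, t, c: torch.zeros_like(x), cond=None, n_points=4)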
2. arXiv:2502.06215 [pdf, other] (cs.SE, cs.AI, cs.CL)
LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks
Authors: Xin Zhou, Martin Weyssow, Ratnadira Widyasari, Ting Zhang, Junda He, Yunbo Lyu, Jianming Chang, Beiqi Zhang, Dan Huang, David Lo
Abstract: Large Language Models (LLMs) are widely used in software engineering (SE) tasks such as code generation and automated program repair. However, their reliance on extensive and often undisclosed pre-training datasets raises significant concerns about data leakage, where evaluation benchmark data is unintentionally "seen" by LLMs during the model's construction phase. Data leakage can largely undermine the validity of LLM-based research and evaluations. Despite the increasing use of LLMs in the SE community, no comprehensive study has yet assessed the extent of data leakage in SE benchmarks for LLMs. To address this gap, this paper presents the first large-scale analysis of data leakage in 83 SE benchmarks concerning LLMs. Our results show that data leakage in SE benchmarks is generally minimal, with average leakage ratios of only 4.8%, 2.8%, and 0.7% for Python, Java, and C/C++ benchmarks, respectively. However, some benchmarks exhibit relatively high leakage ratios, which raises concerns about bias in their evaluations; for instance, QuixBugs and BigCloneBench have leakage ratios of 100.0% and 55.7%, respectively. Furthermore, we observe that data leakage has a substantial impact on LLM evaluation. We also identify key causes of high data leakage, such as the direct inclusion of benchmark data in pre-training datasets and the use of coding platforms like LeetCode for benchmark construction. To address data leakage, we introduce LessLeak-Bench, a new benchmark that removes leaked samples from the 83 SE benchmarks, enabling more reliable LLM evaluations in future research. Our study enhances the understanding of data leakage in SE benchmarks and provides valuable insights for future research involving LLMs in SE.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 25 pages.
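At its simplest, the leakage ratio reported above is the fraction of benchmark samples that also occur in a model's pre-training corpus. A minimal sketch, assuming exact 13-gram overlap as the match criterion (the criterion and threshold are illustrative, not the paper's detection method):

    def ngrams(text, n=13):
        toks = text.split()
        return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}

    def leakage_ratio(benchmark_samples, pretrain_docs, n=13):
        """Fraction of benchmark samples sharing an n-gram with pre-training data."""
        corpus = set()
        for doc in pretrain_docs:
            corpus |= ngrams(doc, n)
        leaked = sum(1 for s in benchmark_samples if ngrams(s, n) & corpus)
        return leaked / len(benchmark_samples)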
3. arXiv:2502.05178 [pdf, other] (cs.CV)
QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation
Authors: Yue Zhao, Fuzhao Xue, Scott Reed, Linxi Fan, Yuke Zhu, Jan Kautz, Zhiding Yu, Philipp Krähenbühl, De-An Huang
Abstract: We introduce Quantized Language-Image Pretraining (QLIP), a visual tokenization method that combines state-of-the-art reconstruction quality with state-of-the-art zero-shot image understanding. QLIP trains a binary-spherical-quantization-based autoencoder with reconstruction and language-image alignment objectives. We are the first to show that the two objectives do not need to be at odds. We balance the two loss terms dynamically during training and show that a two-stage training pipeline effectively mixes the large-batch requirements of image-language pre-training with the memory bottleneck imposed by the reconstruction objective. We validate the effectiveness of QLIP for multimodal understanding and text-conditioned image generation with a single model. Specifically, QLIP serves as a drop-in replacement for the visual encoder of LLaVA and the image tokenizer of LlamaGen, with comparable or even better performance. Finally, we demonstrate that QLIP enables a unified mixed-modality auto-regressive model for understanding and generation.
Submitted 7 February, 2025; originally announced February 2025.
Comments: Tech report. Project page: https://nvlabs.github.io/QLIP/
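The dynamic balancing of the two objectives can be sketched as a weighted sum whose weights adapt to the detached loss magnitudes, so neither term dominates. The inverse-magnitude rule below is an illustrative heuristic, not the paper's exact scheme:

    import torch

    def balanced_loss(recon_loss, align_loss, eps=1e-8):
        """Combine reconstruction and language-image alignment objectives.
        Detached weights rescale each term toward unit magnitude."""
        w_r = 1.0 / (recon_loss.detach() + eps)
        w_a = 1.0 / (align_loss.detach() + eps)
        return w_r * recon_loss + w_a * align_loss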
4. arXiv:2502.02581 [pdf, other] (cs.DC)
Hecate: Unlocking Efficient Sparse Model Training via Fully Sharded Sparse Data Parallelism
Authors: Yuhao Qing, Guichao Zhu, Fanxin Li, Lintian Lei, Zekai Sun, Xiuxian Guan, Shixiong Zhao, Xusheng Chen, Dong Huang, Sen Wang, Heming Cui
Abstract: Mixture-of-Experts (MoE) has emerged as a promising sparse paradigm for scaling up pre-trained models (PTMs) with remarkable cost-effectiveness. However, the dynamic nature of MoE leads to rapid fluctuations and imbalances in expert loads during training, resulting in significant straggler effects that hinder training performance when using expert parallelism (EP). Existing MoE training systems attempt to mitigate these effects through expert rearrangement strategies, but they face challenges in terms of memory efficiency and timeliness of rearrangement. This paper proposes Fully Sharded Sparse Data Parallelism (FSSDP), an approach that tackles the parallelization of MoE layers and the straggler effects caused by imbalanced expert loads from a new perspective. FSSDP fully shards the parameters and optimizer states of MoE layers across devices and sparsely materializes MoE parameters from scratch in each iteration with two sparse collectives, SparseAllGather and SparseReduceScatter. We build Hecate, a high-performance MoE training system that incorporates FSSDP to fully unlock its potential. Hecate introduces heterogeneous sharding, sparse materialization, and re-materialization techniques to construct flexible and efficient expert placements with low memory and communication overhead. Our evaluation shows that Hecate achieves up to 3.54x speedup over state-of-the-art MoE training systems and consistently demonstrates improvements across model architectures and hardware environments.
Submitted 4 February, 2025; originally announced February 2025.
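The sparse-materialization idea can be modeled in miniature: every expert's parameters are sharded across devices, and each iteration gathers full parameters only for experts that actually received tokens. The plain-Python "collective" below merely mimics what SparseAllGather does over GPU shards; all names and shapes are illustrative.

    def sparse_allgather(active_experts, shards_per_device):
        """shards_per_device: per-device dicts {expert_id: parameter shard}."""
        materialized = {}
        for e in active_experts:
            parts = [dev[e] for dev in shards_per_device if e in dev]
            materialized[e] = b"".join(parts)      # reassemble full parameters
        return materialized

    shards = [{0: b"A0", 1: b"B0"}, {0: b"A1", 1: b"B1"}]  # 2 devices, 2 experts
    print(sparse_allgather({0}, shards))           # {0: b'A0A1'}: expert 1 skipped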
5. arXiv:2502.01187 [pdf, other] (cs.AI, cs.CL, cs.LG)
Skewed Memorization in Large Language Models: Quantification and Decomposition
Authors: Hao Li, Di Huang, Ziyu Wang, Amir M. Rahmani
Abstract: Memorization in Large Language Models (LLMs) poses privacy and security risks, as models may unintentionally reproduce sensitive or copyrighted data. Existing analyses focus on average-case scenarios, often neglecting the highly skewed distribution of memorization. This paper examines memorization in LLM supervised fine-tuning (SFT), exploring its relationships with training duration, dataset size, and inter-sample similarity. By analyzing memorization probabilities over sequence lengths, we link this skewness to the token generation process, offering insights for estimating memorization and comparing it to established metrics. Through theoretical analysis and empirical evaluation, we provide a comprehensive understanding of memorization behaviors and propose strategies to detect and mitigate risks, contributing to more privacy-preserving LLMs.
Submitted 3 February, 2025; originally announced February 2025.
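Skew in per-sample memorization can be quantified with any concentration statistic; a Gini coefficient is one generic choice (not necessarily the paper's metric). A minimal sketch over hypothetical per-sample memorization scores:

    import numpy as np

    def gini(scores):
        """0 = memorization spread evenly; near 1 = concentrated in few samples."""
        x = np.sort(np.asarray(scores, dtype=float))
        n = x.size
        cum = np.cumsum(x)
        return float((n + 1 - 2 * (cum / cum[-1]).sum()) / n)

    print(gini([1, 1, 1, 1]))    # 0.0: even
    print(gini([0, 0, 0, 10]))   # 0.75: highly skewed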
6. arXiv:2501.18905 [pdf, other] (cs.ET, quant-ph)
From Bits to Qubits: Challenges in Classical-Quantum Integration
Authors: Sudhanshu Pravin Kulkarni, Daniel E. Huang, E. Wes Bethel
Abstract: While quantum computing holds immense potential for tackling previously intractable problems, its current practicality remains limited. A critical aspect of realizing quantum utility is the ability to efficiently interface with data from the classical world. This research focuses on the crucial phase of quantum encoding, which transforms classical information into quantum states for processing within quantum systems. We analyze the cost and efficiency of three prominent encoding models: Phase Encoding, Qubit Lattice, and Flexible Representation of Quantum Images (FRQI). We quantify their differing characteristics to analyze their impact on quantum processing workflows. This comparative analysis offers valuable insights into their limitations and their potential to accelerate the development of practical quantum computing solutions.
Submitted 31 January, 2025; originally announced January 2025.
Comments: 11 pages, 13 figures. In press: 31st IEEE International Conference on High Performance Computing, Data, and Analytics (HiPC), 2024.
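Phase encoding, the first model named above, stores a classical value in the relative phase of a qubit. A minimal sketch of the standard construction, assuming inputs normalized to [0, 1]:

    import numpy as np

    def phase_encode(x):
        """Encode x in [0, 1] as (|0> + e^{i*pi*x}|1>) / sqrt(2)."""
        return np.array([1.0, np.exp(1j * np.pi * x)]) / np.sqrt(2)

    state = phase_encode(0.5)
    print(np.abs(state) ** 2)    # [0.5, 0.5]: the phase does not show up in
                                 # Z-basis probabilities, only in interference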
7. arXiv:2501.15449 [pdf, other] (cs.CV)
Breaking the SSL-AL Barrier: A Synergistic Semi-Supervised Active Learning Framework for 3D Object Detection
Authors: Zengran Wang, Yanan Zhang, Jiaxin Chen, Di Huang
Abstract: To address the annotation burden in LiDAR-based 3D object detection, active learning (AL) methods offer a promising solution. However, traditional active learning approaches rely solely on a small amount of labeled data to train an initial model for data selection, overlooking the potential of the abundant unlabeled data. Recent attempts to integrate semi-supervised learning (SSL) into AL, with the goal of leveraging unlabeled data, have faced challenges in effectively resolving the conflict between the two paradigms, resulting in less satisfactory performance. To tackle this conflict, we propose a Synergistic Semi-Supervised Active Learning framework, dubbed S-SSAL. Specifically, from the perspective of SSL, we propose a Collaborative PseudoScene Pre-training (CPSP) method that effectively learns from unlabeled data without introducing adverse effects. From the perspective of AL, we design a Collaborative Active Learning (CAL) method, which complements the uncertainty and diversity methods by model cascading, allowing us to fully exploit the potential of the CPSP pre-trained model. Extensive experiments conducted on KITTI and Waymo demonstrate the effectiveness of our S-SSAL framework. Notably, on the KITTI dataset, using only 2% labeled data, S-SSAL achieves performance comparable to models trained on the full dataset.
Submitted 26 January, 2025; originally announced January 2025.
8. arXiv:2501.15062 [pdf, other] (cs.LG)
Exact Fit Attention in Node-Holistic Graph Convolutional Network for Improved EEG-Based Driver Fatigue Detection
Authors: Meiyan Xu, Qingqing Chen, Duo Chen, Yi Ding, Jingyuan Wang, Peipei Gu, Yijie Pan, Deshuang Huang, Xun Zhang, Jiayang Guo
Abstract: EEG-based fatigue monitoring can effectively reduce the incidence of related traffic accidents. In the past decade, with the advancement of deep learning, convolutional neural networks (CNNs) have been increasingly used for EEG signal processing. However, due to the data's non-Euclidean characteristics, existing CNNs may lose important spatial information from EEG, specifically channel correlation. Thus, we propose the node-holistic graph convolutional network (NHGNet), a model that uses graph convolution to dynamically learn each channel's features. With exact-fit attention optimization, the network captures inter-channel correlations through a trainable adjacency matrix. Interpretability is enhanced by revealing critical areas of brain activity and their interrelations in various mental states. In validation on two public datasets, NHGNet outperforms state-of-the-art methods: it improves detection accuracy by at least 2.34% and 3.42% in the intra-subject setting, and by at least 2.09% and 15.06% in the inter-subject setting. Visualization of the model reveals that the central parietal area plays an important role in detecting fatigue levels, whereas the frontal and temporal lobes are essential for maintaining vigilance.
Submitted 24 January, 2025; originally announced January 2025.
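The trainable-adjacency idea can be sketched as a graph convolution over EEG channels whose adjacency matrix is itself a learned parameter, row-normalized before aggregation. This generic layer is an illustration, not NHGNet's exact-fit attention:

    import torch
    import torch.nn as nn

    class LearnedAdjacencyGCN(nn.Module):
        """Graph convolution with a trainable channel-correlation matrix."""
        def __init__(self, n_channels, in_dim, out_dim):
            super().__init__()
            self.adj = nn.Parameter(torch.eye(n_channels))  # learned graph
            self.lin = nn.Linear(in_dim, out_dim)

        def forward(self, x):                      # x: (batch, channels, in_dim)
            a = torch.softmax(self.adj, dim=-1)    # rows -> aggregation weights
            return torch.relu(self.lin(a @ x))     # aggregate, then transform

    layer = LearnedAdjacencyGCN(n_channels=32, in_dim=128, out_dim=64)
    out = layer(torch.randn(8, 32, 128))           # -> (8, 32, 64)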
9. arXiv:2501.14818 [pdf, other] (cs.CV, cs.AI, cs.LG)
Eagle 2: Building Post-Training Data Strategies from Scratch for Frontier Vision-Language Models
Authors: Zhiqi Li, Guo Chen, Shilong Liu, Shihao Wang, Vibashan VS, Yishen Ji, Shiyi Lan, Hao Zhang, Yilin Zhao, Subhashree Radhakrishnan, Nadine Chang, Karan Sapra, Amala Sanjay Deshmukh, Tuomas Rintamaki, Matthieu Le, Ilia Karmanov, Lukas Voegtle, Philipp Fischer, De-An Huang, Timo Roman, Tong Lu, Jose M. Alvarez, Bryan Catanzaro, Jan Kautz, Andrew Tao, et al. (2 additional authors not shown)
Abstract: Recently, open-source vision-language models (VLMs) have made promising progress in bringing their capabilities closer to those of proprietary frontier models. However, most open-source models publish only their final weights, leaving the critical details of data strategies and implementation largely opaque. In this work, we address VLM post-training from a data-centric perspective, showing the key role of data strategy in developing frontier VLMs. By studying and building our post-training data strategy from scratch, we share detailed insights into the development process, aiming to benefit the development of competitive models for the open-source community. Our data strategy, together with training recipes and model design, leads to a family of performant VLMs named Eagle2. Specifically, Eagle2-9B achieves state-of-the-art results across various multimodal benchmarks, matching certain competitive models with up to 70B parameters.
Submitted 20 January, 2025; originally announced January 2025.
10. arXiv:2501.14625 [pdf, other] (cs.GT, cs.LG)
Accelerated Preference Elicitation with LLM-Based Proxies
Authors: David Huang, Francisco Marmolejo-Cossío, Edwin Lock, David Parkes
Abstract: Bidders in combinatorial auctions face significant challenges when describing their preferences to an auctioneer. Classical work on preference elicitation focuses on query-based techniques inspired by proper learning--often via proxies that interface between bidders and an auction mechanism--to incrementally learn bidder preferences as needed to compute efficient allocations. Although such elicitation mechanisms enjoy theoretical query efficiency, the amount of communication required may still be too cognitively taxing in practice. We propose a family of efficient LLM-based proxy designs for eliciting preferences from bidders using natural language. Our proposed mechanism combines LLM pipelines and DNF-proper-learning techniques to quickly approximate preferences when communication is limited. To validate our approach, we create a testing sandbox for elicitation mechanisms that communicate in natural language. In our experiments, our most promising LLM proxy design reaches approximately efficient outcomes with five times fewer queries than classical proper-learning-based elicitation mechanisms.
Submitted 24 January, 2025; originally announced January 2025.
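In DNF proper learning, as referenced above, a bidder's demand over bundles is represented as an OR of ANDs of items, and a membership query simply evaluates that formula on a candidate bundle. A toy sketch (the formula and query protocol are illustrative):

    # Value is 1 if the bundle contains every item of at least one term.
    dnf = [{"apple", "banana"}, {"cherry"}]        # hypothetical preferences

    def membership_query(bundle):
        """Answer a proxy's query: is this bundle acceptable to the bidder?"""
        return any(term <= bundle for term in dnf)

    print(membership_query({"apple", "banana", "pear"}))  # True
    print(membership_query({"apple"}))                    # False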
11. arXiv:2501.14231 [pdf, other] (cs.CV)
Micro-macro Wavelet-based Gaussian Splatting for 3D Reconstruction from Unconstrained Images
Authors: Yihui Li, Chengxin Lv, Hongyu Yang, Di Huang
Abstract: 3D reconstruction from unconstrained image collections presents substantial challenges due to varying appearances and transient occlusions. In this paper, we introduce Micro-macro Wavelet-based Gaussian Splatting (MW-GS), a novel approach designed to enhance 3D reconstruction by disentangling scene representations into global, refined, and intrinsic components. The proposed method features two key innovations: Micro-macro Projection, which allows Gaussian points to capture details from feature maps across multiple scales with enhanced diversity; and Wavelet-based Sampling, which leverages frequency-domain information to refine feature representations and significantly improve the modeling of scene appearance. Additionally, we incorporate a Hierarchical Residual Fusion Network to seamlessly integrate these features. Extensive experiments demonstrate that MW-GS delivers state-of-the-art rendering performance, surpassing existing methods.
Submitted 23 January, 2025; originally announced January 2025.
Comments: 11 pages, 6 figures; accepted by AAAI 2025.
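Wavelet-based sampling rests on splitting a feature map into low- and high-frequency bands. A minimal sketch with PyWavelets; the Haar wavelet and single-level transform are arbitrary choices for illustration:

    import numpy as np
    import pywt

    feat = np.random.rand(64, 64)                  # one feature-map channel
    cA, (cH, cV, cD) = pywt.dwt2(feat, "haar")     # low-freq + 3 detail bands
    print(cA.shape, cH.shape)                      # (32, 32) (32, 32)

    # The transform is invertible, so no information is lost by operating
    # on the frequency bands separately.
    recon = pywt.idwt2((cA, (cH, cV, cD)), "haar")
    print(np.allclose(recon, feat))                # True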
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08326v1-abstract-full').style.display = 'none'; document.getElementById('2501.08326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://miranheo.github.io/omni-rgpt/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06884">arXiv:2501.06884</a> <span> [<a href="https://arxiv.org/pdf/2501.06884">pdf</a>, <a href="https://arxiv.org/format/2501.06884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Transforming Vision Transformer: Towards Efficient Multi-Task Asynchronous Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Hanwen Zhong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaxin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yutong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06884v1-abstract-short" style="display: inline;"> Multi-Task Learning (MTL) for Vision Transformer aims at enhancing the model capability by tackling multiple tasks simultaneously. Most recent works have predominantly focused on designing Mixture-of-Experts (MoE) structures and in tegrating Low-Rank Adaptation (LoRA) to efficiently perform multi-task learning. However, their rigid combination hampers both the optimization of MoE and the ef fectiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06884v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06884v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06884v1-abstract-full" style="display: none;"> Multi-Task Learning (MTL) for Vision Transformer aims at enhancing the model capability by tackling multiple tasks simultaneously. Most recent works have predominantly focused on designing Mixture-of-Experts (MoE) structures and in tegrating Low-Rank Adaptation (LoRA) to efficiently perform multi-task learning. However, their rigid combination hampers both the optimization of MoE and the ef fectiveness of reparameterization of LoRA, leading to sub-optimal performance and low inference speed. In this work, we propose a novel approach dubbed Efficient Multi-Task Learning (EMTAL) by transforming a pre-trained Vision Transformer into an efficient multi-task learner during training, and reparameterizing the learned structure for efficient inference. 
arXiv:2501.06884 [pdf, other] cs.CV
Transforming Vision Transformer: Towards Efficient Multi-Task Asynchronous Learning
Authors: Hanwen Zhong, Jiaxin Chen, Yutong Zhang, Di Huang, Yunhong Wang
Abstract: Multi-Task Learning (MTL) for Vision Transformer aims at enhancing the model capability by tackling multiple tasks simultaneously. Most recent works have predominantly focused on designing Mixture-of-Experts (MoE) structures and integrating Low-Rank Adaptation (LoRA) to efficiently perform multi-task learning. However, their rigid combination hampers both the optimization of MoE and the effectiveness of the reparameterization of LoRA, leading to sub-optimal performance and low inference speed. In this work, we propose a novel approach dubbed Efficient Multi-Task Learning (EMTAL), which transforms a pre-trained Vision Transformer into an efficient multi-task learner during training and reparameterizes the learned structure for efficient inference. Specifically, we first develop the MoEfied LoRA structure, which decomposes the pre-trained Transformer into a low-rank MoE structure and employs LoRA to fine-tune the parameters. Subsequently, we take into account the intrinsically asynchronous nature of multi-task learning and devise a Quality Retaining (QR) optimization mechanism, leveraging historical high-quality class logits to prevent a well-trained task from degrading. Finally, we design a router fading strategy to integrate the learned parameters into the original Transformer, achieving efficient inference. Extensive experiments on public benchmarks demonstrate the superiority of our method compared to state-of-the-art multi-task learning approaches.
Submitted 12 January, 2025; originally announced January 2025.
Comments: Accepted by the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)
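The "reparameterize for inference" step rests on the standard LoRA identity that a low-rank update can be folded into the frozen weight. A minimal sketch of that generic merge is below; EMTAL's router-fading merge is more involved, and this only shows the underlying trick.

```python
import torch

@torch.no_grad()
def merge_lora(W: torch.Tensor, A: torch.Tensor, B: torch.Tensor, scale: float):
    """Fold a low-rank adapter into the base weight so inference needs one matmul:
    W_merged = W + scale * B @ A, with W (out, in), B (out, r), A (r, in)."""
    return W + scale * (B @ A)
```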
arXiv:2501.06465 [pdf, other] cs.CL cs.AI
MedCT: A Clinical Terminology Graph for Generative AI Applications in Healthcare
Authors: Ye Chen, Dongdong Huang, Haoyun Xu, Cong Fu, Lin Sheng, Qingli Zhou, Yuqiang Shen, Kai Wang
Abstract: We introduce the world's first clinical terminology for the Chinese healthcare community, namely MedCT, accompanied by a clinical foundation model MedBERT and an entity linking model MedLink. The MedCT system enables standardized and programmable representation of Chinese clinical data, in turn stimulating the development of new medicines, treatment pathways, and better patient outcomes for the populous Chinese community. Moreover, the MedCT knowledge graph provides a principled mechanism to minimize the hallucination problem of large language models (LLMs), therefore achieving significant levels of accuracy and safety in LLM-based clinical applications. By leveraging the LLMs' emergent capabilities of generativeness and expressiveness, we were able to rapidly build a production-quality terminology system and deploy it to the real-world clinical field within three months, whereas classical terminologies like SNOMED CT have gone through more than twenty years of development. Our experiments show that the MedCT system achieves state-of-the-art (SOTA) performance in semantic matching and entity linking tasks, not only for Chinese but also for English. We also conducted a longitudinal field experiment by applying MedCT and LLMs to a representative spectrum of clinical tasks, including electronic health record (EHR) auto-generation and medical document search for diagnostic decision making. Our study shows a multitude of values of MedCT for clinical workflows and patient outcomes, especially in the new genre of clinical LLM applications. We present our approach in sufficient engineering detail that implementing a clinical terminology for other non-English societies should be readily reproducible. We openly release our terminology, models, and algorithms, along with real-world clinical datasets, to support further development.
Submitted 20 January, 2025; v1 submitted 11 January, 2025; originally announced January 2025.
arXiv:2501.04315 [pdf, other] cs.LG cs.AI
RoRA: Efficient Fine-Tuning of LLM with Reliability Optimization for Rank Adaptation
Authors: Jun Liu, Zhenglun Kong, Peiyan Dong, Changdi Yang, Xuan Shen, Pu Zhao, Hao Tang, Geng Yuan, Wei Niu, Wenbin Zhang, Xue Lin, Dong Huang, Yanzhi Wang
Abstract: Fine-tuning helps large language models (LLM) recover degraded information and enhance task performance. Although Low-Rank Adaptation (LoRA) is widely used and effective for fine-tuning, we have observed that its scaling factor can limit or even reduce performance as the rank size increases. To address this issue, we propose RoRA (Rank-adaptive Reliability Optimization), a simple yet effective method for optimizing LoRA's scaling factor. By replacing $\alpha/r$ with $\alpha/\sqrt{r}$, RoRA ensures improved performance as rank size increases. Moreover, RoRA enhances low-rank adaptation in fine-tuning uncompressed models and excels in the more challenging task of accuracy recovery when fine-tuning pruned models. Extensive experiments demonstrate the effectiveness of RoRA in fine-tuning both uncompressed and pruned models.
RoRA surpasses the state-of-the-art (SOTA) in average accuracy and robustness on LLaMA-7B/13B, LLaMA2-7B, and LLaMA3-8B, specifically outperforming LoRA and DoRA by 6.5% and 2.9% on LLaMA-7B, respectively. In pruned-model fine-tuning, RoRA shows significant advantages; for SHEARED-LLAMA-1.3, a LLaMA-7B with 81.4% pruning, RoRA achieves 5.7% higher average accuracy than LoRA and 3.9% higher than DoRA.
Submitted 11 January, 2025; v1 submitted 8 January, 2025; originally announced January 2025.
Comments: 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)
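The abstract's core change is confined to LoRA's scaling factor: the low-rank update is scaled by $\alpha/\sqrt{r}$ instead of $\alpha/r$, so the update's magnitude does not shrink as the rank grows. A minimal sketch under that reading follows; the module structure and initialization details are our assumptions, not the authors' code.

```python
import math
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """A frozen linear layer plus a trainable low-rank update with a selectable scaling rule."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0, rora: bool = True):
        super().__init__()
        self.base = base.requires_grad_(False)  # frozen pre-trained layer
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        # Vanilla LoRA scales the update by alpha / r; RoRA's proposal is alpha / sqrt(r).
        self.scale = alpha / math.sqrt(r) if rora else alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
```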
arXiv:2501.04259 [pdf, other] cs.LG math.NA
Stable Derivative Free Gaussian Mixture Variational Inference for Bayesian Inverse Problems
Authors: Baojun Che, Yifan Chen, Zhenghao Huan, Daniel Zhengyu Huang, Weijie Wang
Abstract: This paper is concerned with the approximation of probability distributions known up to normalization constants, with a focus on Bayesian inference for large-scale inverse problems in scientific computing. In this context, key challenges include costly repeated evaluations of forward models, multimodality, and inaccessible gradients for the forward model. To address them, we develop a variational inference framework that combines Fisher-Rao natural gradient with specialized quadrature rules to enable derivative-free updates of Gaussian mixture variational families. The resulting method, termed Derivative Free Gaussian Mixture Variational Inference (DF-GMVI), guarantees covariance positivity and affine invariance, offering a stable and efficient framework for approximating complex posterior distributions. The effectiveness of DF-GMVI is demonstrated through numerical experiments on challenging scenarios, including distributions with multiple modes, infinitely many modes, and curved modes in spaces with up to hundreds of dimensions. The method's practicality is further demonstrated in a large-scale application, where it successfully recovers the initial conditions of the Navier-Stokes equations from solution data at positive times.
Submitted 7 January, 2025; originally announced January 2025.
Comments: 25 pages, 10 figures
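For orientation, the variational family in question is a Gaussian mixture; written generically (our notation, not necessarily the paper's):

```latex
% Gaussian mixture variational family with weights w_k, means m_k, covariances C_k:
\[
  q(\theta) \;=\; \sum_{k=1}^{K} w_k \,\mathcal{N}(\theta;\, m_k, C_k),
  \qquad w_k \ge 0, \quad \sum_{k=1}^{K} w_k = 1 .
\]
% Per the abstract, DF-GMVI evolves (w_k, m_k, C_k) along the Fisher-Rao natural
% gradient of the divergence to the unnormalized target, with the required
% expectations approximated by quadrature so that no forward-model gradients
% are needed.
```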
arXiv:2501.03562 [pdf, other] cs.LG cs.AI
Rethinking Adversarial Attacks in Reinforcement Learning from Policy Distribution Perspective
Authors: Tianyang Duan, Zongyuan Zhang, Zheng Lin, Yue Gao, Ling Xiong, Yong Cui, Hongbin Liang, Xianhao Chen, Heming Cui, Dong Huang
Abstract: Deep Reinforcement Learning (DRL) suffers from uncertainties and inaccuracies in the observation signal in real-world applications. Adversarial attacks are an effective method for evaluating the robustness of DRL agents. However, existing attack methods targeting individual sampled actions have limited impact on the overall policy distribution, particularly in continuous action spaces. To address these limitations, we propose the Distribution-Aware Projected Gradient Descent attack (DAPGD). DAPGD uses distribution similarity as the gradient perturbation input to attack the policy network, leveraging the entire policy distribution rather than relying on individual samples. We utilize the Bhattacharyya distance in DAPGD to measure policy similarity, enabling sensitive detection of subtle but critical differences between probability distributions. Our experimental results demonstrate that DAPGD achieves SOTA results compared to the baselines in three robot navigation tasks, achieving an average 22.03% higher reward drop compared to the best baseline.
Submitted 8 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 2 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20954">arXiv:2412.20954</a> <span> [<a href="https://arxiv.org/pdf/2412.20954">pdf</a>, <a href="https://arxiv.org/format/2412.20954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> AGON: Automated Design Framework for Customizing Processors from ISA Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chongxiao Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Pengwei Jin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tianyun Ma</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Husheng Han</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Shuyao Cheng</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yifan Hao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yongwei Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Guanglin Xu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zidong Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaqing Li</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yuanbo Wen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20954v2-abstract-short" style="display: inline;"> Customized processors are attractive solutions for vast domain-specific applications due to their high energy efficiency. However, designing a processor in traditional flows is time-consuming and expensive. To address this, researchers have explored methods including the use of agile development tools like Chisel or SpinalHDL, high-level synthesis (HLS) from programming languages like C or SystemC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20954v2-abstract-full').style.display = 'inline'; document.getElementById('2412.20954v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20954v2-abstract-full" style="display: none;"> Customized processors are attractive solutions for vast domain-specific applications due to their high energy efficiency. However, designing a processor in traditional flows is time-consuming and expensive. To address this, researchers have explored methods including the use of agile development tools like Chisel or SpinalHDL, high-level synthesis (HLS) from programming languages like C or SystemC, and more recently, leveraging large language models (LLMs) to generate hardware description language (HDL) code from natural language descriptions. 
arXiv:2412.20954 [pdf, other] cs.AR
AGON: Automated Design Framework for Customizing Processors from ISA Documents
Authors: Chongxiao Li, Di Huang, Pengwei Jin, Tianyun Ma, Husheng Han, Shuyao Cheng, Yifan Hao, Yongwei Zhao, Guanglin Xu, Zidong Du, Rui Zhang, Xiaqing Li, Yuanbo Wen, Xing Hu, Qi Guo
Abstract: Customized processors are attractive solutions for vast domain-specific applications due to their high energy efficiency. However, designing a processor in traditional flows is time-consuming and expensive. To address this, researchers have explored methods including the use of agile development tools like Chisel or SpinalHDL, high-level synthesis (HLS) from programming languages like C or SystemC, and, more recently, leveraging large language models (LLMs) to generate hardware description language (HDL) code from natural language descriptions. However, each method has limitations in terms of expressiveness, correctness, and performance, leading to a persistent contradiction between the level of automation and the effectiveness of the design. Overall, how to automatically design highly efficient and practical processors with minimal human effort remains a challenge. In this paper, we propose AGON, a novel framework designed to leverage LLMs for the efficient design of out-of-order (OoO) customized processors with minimal human effort. Central to AGON is the nano-operator function (nOP function) based Intermediate Representation (IR), which bridges high-level descriptions and hardware implementations while decoupling functionality from performance optimization, thereby providing an automatic design framework that is expressive and efficient, has correctness guarantees, and enables PPA (Power, Performance, and Area) optimization. Experimental results show that, superior to previous LLM-assisted automatic design flows, AGON facilitates the design of a series of customized OoO processors that achieve on average a 2.35× speedup compared with BOOM, a general-purpose CPU designed by experts, with minimal design effort.
Submitted 21 January, 2025; v1 submitted 30 December, 2024; originally announced December 2024.
arXiv:2412.19712 [pdf, other] cs.CV
From Elements to Design: A Layered Approach for Automatic Graphic Design Composition
Authors: Jiawei Lin, Shizhao Sun, Danqing Huang, Ting Liu, Ji Li, Jiang Bian
Abstract: In this work, we investigate automatic design composition from multimodal graphic elements. Although recent studies have developed various generative models for graphic design, they usually face the following limitations: they only focus on certain subtasks and are far from achieving the design composition task; they do not consider the hierarchical information of graphic designs during the generation process. To tackle these issues, we introduce the layered design principle into Large Multimodal Models (LMMs) and propose a novel approach, called LaDeCo, to accomplish this challenging task. Specifically, LaDeCo first performs layer planning for a given element set, dividing the input elements into different semantic layers according to their contents. Based on the planning results, it subsequently predicts element attributes that control the design composition in a layer-wise manner, and includes the rendered image of previously generated layers in the context. With this insightful design, LaDeCo decomposes the difficult task into smaller, manageable steps, making the generation process smoother and clearer. The experimental results demonstrate the effectiveness of LaDeCo in design composition. Furthermore, we show that LaDeCo enables some interesting applications in graphic design, such as resolution adjustment, element filling, and design variation. In addition, it even outperforms specialized models in some design subtasks without any task-specific training.
Submitted 27 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: $\href{https://elements2design.github.io/}{\text{elements2design}}$</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19149">arXiv:2412.19149</a> <span> [<a href="https://arxiv.org/pdf/2412.19149">pdf</a>, <a href="https://arxiv.org/format/2412.19149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generating Editable Head Avatars with 3D Gaussian GANs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongyu Yang</a>, <a href="/search/cs?searchtype=author&query=Men%2C+Y">Yifang Men</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weixin Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+R">Ruijie Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19149v1-abstract-short" style="display: inline;"> Generating animatable and editable 3D head avatars is essential for various applications in computer vision and graphics. Traditional 3D-aware generative adversarial networks (GANs), often using implicit fields like Neural Radiance Fields (NeRF), achieve photorealistic and view-consistent 3D head synthesis. However, these methods face limitations in deformation flexibility and editability, hinderi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19149v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19149v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19149v1-abstract-full" style="display: none;"> Generating animatable and editable 3D head avatars is essential for various applications in computer vision and graphics. Traditional 3D-aware generative adversarial networks (GANs), often using implicit fields like Neural Radiance Fields (NeRF), achieve photorealistic and view-consistent 3D head synthesis. However, these methods face limitations in deformation flexibility and editability, hindering the creation of lifelike and easily modifiable 3D heads. We propose a novel approach that enhances the editability and animation control of 3D head avatars by incorporating 3D Gaussian Splatting (3DGS) as an explicit 3D representation. This method enables easier illumination control and improved editability. Central to our approach is the Editable Gaussian Head (EG-Head) model, which combines a 3D Morphable Model (3DMM) with texture maps, allowing precise expression control and flexible texture editing for accurate animation while preserving identity. To capture complex non-facial geometries like hair, we use an auxiliary set of 3DGS and tri-plane features. Extensive experiments demonstrate that our approach delivers high-quality 3D-aware synthesis with state-of-the-art controllability. 
Our code and models are available at https://github.com/liguohao96/EGG3D.
Submitted 26 December, 2024; originally announced December 2024.

arXiv:2412.17373 [pdf, other] cs.AI
FRTP: Federating Route Search Records to Enhance Long-term Traffic Prediction
Authors: Hangli Ge, Xiaojie Yang, Itsuki Matsunaga, Dizhi Huang, Noboru Koshizuka
Abstract: Accurate traffic prediction, especially predicting traffic conditions several days in advance, is essential for intelligent transportation systems (ITS). Such predictions enable mid- and long-term traffic optimization, which is crucial for efficient transportation planning. However, the inclusion of diverse external features, alongside the complexities of spatial relationships and temporal uncertainties, significantly increases the complexity of forecasting models. Additionally, traditional approaches have handled data preprocessing separately from the learning model, leading to inefficiencies caused by repeated trials of preprocessing and training. In this study, we propose a federated architecture capable of learning directly from raw data with varying features and time granularities or lengths. The model adopts a unified design that accommodates different feature types, time scales, and temporal periods. Our experiments focus on federating route search records and begin by processing raw data within the model framework.
Unlike traditional models, this approach integrates the data federation phase into the learning process, enabling compatibility with various time frequencies and input/output configurations. The accuracy of the proposed model is demonstrated through evaluations using diverse learning patterns and parameter settings. The results show that online search log data is useful for forecasting long-term traffic, highlighting the model's adaptability and efficiency.
Submitted 23 December, 2024; originally announced December 2024.
Comments: Accepted by IEEE BigData 2024

arXiv:2412.15790 [pdf, other] q-bio.QM cs.AI cs.LG
GraphSeqLM: A Unified Graph Language Framework for Omic Graph Learning
Authors: Heming Zhang, Di Huang, Yixin Chen, Fuhai Li
Abstract: The integration of multi-omic data is pivotal for understanding complex diseases, but its high dimensionality and noise present significant challenges.
Graph Neural Networks (GNNs) offer a robust framework for analyzing large-scale signaling pathways and protein-protein interaction networks, yet they face limitations in expressivity when capturing intricate biological relationships. To address this, we propose Graph Sequence Language Model (GraphSeqLM), a framework that enhances GNNs with biological sequence embeddings generated by Large Language Models (LLMs). These embeddings encode structural and biological properties of DNA, RNA, and proteins, augmenting GNNs with enriched features for analyzing sample-specific multi-omic data. By integrating topological, sequence-derived, and biological information, GraphSeqLM demonstrates superior predictive accuracy and outperforms existing methods, paving the way for more effective multi-omic data integration in precision medicine.
Submitted 20 December, 2024; originally announced December 2024.
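The abstract's central move is to enrich node features with LLM-derived sequence embeddings before message passing. A toy sketch of one such GNN layer is below; the shapes, mean aggregation, and function name are illustrative assumptions rather than the paper's architecture.

```python
import torch

def gnn_layer_with_seq(x_omics, x_seq, adj, W):
    """One mean-aggregation message-passing step over nodes whose features concatenate
    omic measurements with sequence embeddings.
    x_omics: (N, d1), x_seq: (N, d2), adj: (N, N) 0/1 adjacency, W: (d1 + d2, d_out)."""
    x = torch.cat([x_omics, x_seq], dim=-1)        # fuse omic and sequence features
    deg = adj.sum(dim=-1, keepdim=True).clamp(min=1)
    return torch.relu(((adj @ x) / deg) @ W)       # average neighbors, then project
```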
arXiv:2412.12516 [pdf] q-fin.CP cs.LG
Enhanced Momentum with Momentum Transformers
Authors: Max Mason, Waasi A Jagirdar, David Huang, Rahul Murugan
Abstract: The primary objective of this research is to build a Momentum Transformer that is expected to outperform benchmark time-series momentum and mean-reversion trading strategies. We extend the ideas introduced in the paper Trading with the Momentum Transformer: An Intelligent and Interpretable Architecture to equities, as the original paper builds primarily upon futures and equity indices. Unlike conventional Long Short-Term Memory (LSTM) models, which operate sequentially and are optimized for processing local patterns, an attention mechanism equips our architecture with direct access to all prior time steps in the training window. This hybrid design, combining attention with an LSTM, enables the model to capture long-term dependencies, enhance performance in scenarios accounting for transaction costs, and seamlessly adapt to evolving market conditions, such as those witnessed during the Covid pandemic. We average 4.14% returns, similar to the original paper's results. Our Sharpe ratio is lower, at an average of 1.12, owing to much higher volatility, which may be due to stocks being inherently more volatile than futures and indices.
Submitted 16 December, 2024; originally announced December 2024.
Comments: 7 pages, 5 figures
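The hybrid the abstract describes, an LSTM for local sequential structure plus attention granting direct access to all prior steps, can be sketched in a few lines. The hyperparameters and the tanh position head below are illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class MomentumLSTMAttention(nn.Module):
    """Toy LSTM + self-attention hybrid for sizing a position from a feature window."""
    def __init__(self, n_features: int, d_model: int = 64, n_heads: int = 4):
        super().__init__()
        self.lstm = nn.LSTM(n_features, d_model, batch_first=True)
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.head = nn.Linear(d_model, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, time, n_features)
        h, _ = self.lstm(x)                # sequential encoding of local patterns
        z, _ = self.attn(h, h, h)          # direct access to all prior time steps
        return torch.tanh(self.head(z[:, -1]))  # position in [-1, 1] at the last step
```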
arXiv:2412.11599 [pdf, other] cs.CV
3D$^2$-Actor: Learning Pose-Conditioned 3D-Aware Denoiser for Realistic Gaussian Avatar Modeling
Authors: Zichen Tang, Hongyu Yang, Hanchen Zhang, Jiaxin Chen, Di Huang
Abstract: Advancements in neural implicit representations and differentiable rendering have markedly improved the ability to learn animatable 3D avatars from sparse multi-view RGB videos. However, current methods that map observation space to canonical space often face challenges in capturing pose-dependent details and generalizing to novel poses. While diffusion models have demonstrated remarkable zero-shot capabilities in 2D image generation, their potential for creating animatable 3D avatars from 2D inputs remains underexplored. In this work, we introduce 3D$^2$-Actor, a novel approach featuring a pose-conditioned 3D-aware human modeling pipeline that integrates iterative 2D denoising and 3D rectifying steps. The 2D denoiser, guided by pose cues, generates detailed multi-view images that provide the rich feature set necessary for high-fidelity 3D reconstruction and pose rendering. Complementing this, our Gaussian-based 3D rectifier renders images with enhanced 3D consistency through a two-stage projection strategy and a novel local coordinate representation. Additionally, we propose an innovative sampling strategy to ensure smooth temporal continuity across frames in video synthesis. Our method effectively addresses the limitations of traditional numerical solutions in handling ill-posed mappings, producing realistic and animatable 3D human avatars. Experimental results demonstrate that 3D$^2$-Actor excels in high-fidelity avatar modeling and robustly generalizes to novel poses. Code is available at: https://github.com/silence-tang/GaussianActor.
Submitted 16 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08034">arXiv:2412.08034</a> <span> [<a href="https://arxiv.org/pdf/2412.08034">pdf</a>, <a href="https://arxiv.org/format/2412.08034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Static-Dynamic Class-level Perception Consistency in Video Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cen%2C+Z">Zhigang Cen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+N">Ningyan Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenjing Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhiyong Feng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Danlan Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08034v1-abstract-short" style="display: inline;"> Video semantic segmentation(VSS) has been widely employed in lots of fields, such as simultaneous localization and mapping, autonomous driving and surveillance. Its core challenge is how to leverage temporal information to achieve better segmentation. Previous efforts have primarily focused on pixel-level static-dynamic contexts matching, utilizing techniques such as optical flow and attention mec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08034v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08034v1-abstract-full" style="display: none;"> Video semantic segmentation(VSS) has been widely employed in lots of fields, such as simultaneous localization and mapping, autonomous driving and surveillance. Its core challenge is how to leverage temporal information to achieve better segmentation. Previous efforts have primarily focused on pixel-level static-dynamic contexts matching, utilizing techniques such as optical flow and attention mechanisms. Instead, this paper rethinks static-dynamic contexts at the class level and proposes a novel static-dynamic class-level perceptual consistency (SD-CPC) framework. In this framework, we propose multivariate class prototype with contrastive learning and a static-dynamic semantic alignment module. The former provides class-level constraints for the model, obtaining personalized inter-class features and diversified intra-class features. The latter first establishes intra-frame spatial multi-scale and multi-level correlations to achieve static semantic alignment. Then, based on cross-frame static perceptual differences, it performs two-stage cross-frame selective aggregation to achieve dynamic semantic alignment. Meanwhile, we propose a window-based attention map calculation method that leverages the sparsity of attention points during cross-frame aggregation to reduce computation cost. 
Extensive experiments on the VSPW and Cityscapes datasets show that the proposed approach outperforms state-of-the-art methods. Our implementation will be open-sourced on GitHub.
Submitted 10 December, 2024; originally announced December 2024.

arXiv:2412.07168 [pdf, other] cs.CV
3A-YOLO: New Real-Time Object Detectors with Triple Discriminative Awareness and Coordinated Representations
Authors: Xuecheng Wu, Junxiao Xue, Liangyu Fu, Jiayu Nie, Danlei Huang, Xinyi Yin
Abstract: Recent research on real-time object detectors (e.g., the YOLO series) has demonstrated the effectiveness of attention mechanisms for elevating model performance. Nevertheless, existing methods neglect to unifiedly deploy hierarchical attention mechanisms to construct a more discriminative YOLO head enriched with more useful intermediate features. To tackle this gap, this work aims to leverage multiple attention mechanisms to hierarchically enhance the triple discriminative awareness of the YOLO detection head and complementarily learn coordinated intermediate representations, resulting in a new series of detectors denoted 3A-YOLO. Specifically, we first propose a new head, denoted the TDA-YOLO Module, which unifiedly enhances the representation learning of scale-awareness, spatial-awareness, and task-awareness.
Second, we steer the intermediate features to jointly learn inter-channel relationships and precise positional information. Finally, we improve the neck network and introduce various tricks to boost the adaptability of 3A-YOLO. Extensive experiments across the COCO and VOC benchmarks demonstrate the effectiveness of our detectors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07168v1-abstract-full').style.display = 'none'; document.getElementById('2412.07168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06413">arXiv:2412.06413</a> <span> [<a href="https://arxiv.org/pdf/2412.06413">pdf</a>, <a href="https://arxiv.org/format/2412.06413">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> World-Consistent Data Generation for Vision-and-Language Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yu Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chuan Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiaming Guo</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Shaohui Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yanyang Yan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Ping Tan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06413v1-abstract-short" style="display: inline;"> Vision-and-Language Navigation (VLN) is a challenging task that requires an agent to navigate through photorealistic environments following natural-language instructions. One main obstacle in VLN is data scarcity, which leads to poor generalization over unseen environments. Though data augmentation is a promising way to scale up the dataset, how to generate VLN data that is both divers… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06413v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06413v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06413v1-abstract-full" style="display: none;"> Vision-and-Language Navigation (VLN) is a challenging task that requires an agent to navigate through photorealistic environments following natural-language instructions.
One main obstacle in VLN is data scarcity, which leads to poor generalization over unseen environments. Though data augmentation is a promising way to scale up the dataset, how to generate VLN data that is both diverse and world-consistent remains problematic. To cope with this issue, we propose world-consistent data generation (WCGEN), an efficacious data-augmentation framework satisfying both diversity and world-consistency, aimed at enhancing agents' generalization to novel environments. Our framework consists of two stages: the trajectory stage, which leverages a point-cloud based technique to ensure spatial coherency among viewpoints, and the viewpoint stage, which adopts a novel angle synthesis method to guarantee spatial and wraparound consistency within the entire observation. By accurately predicting viewpoint changes with 3D knowledge, our approach maintains world-consistency throughout the generation procedure. Experiments on a wide range of datasets verify the effectiveness of our method, demonstrating that our data augmentation strategy enables agents to achieve new state-of-the-art results on all navigation tasks, and is capable of enhancing the VLN agents' generalization ability to unseen environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06413v1-abstract-full').style.display = 'none'; document.getElementById('2412.06413v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04468">arXiv:2412.04468</a> <span> [<a href="https://arxiv.org/pdf/2412.04468">pdf</a>, <a href="https://arxiv.org/format/2412.04468">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NVILA: Efficient Frontier Visual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhijian Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Ligeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Baifeng Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuoyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lou%2C+Y">Yuming Lou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shang Yang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haocheng Xi</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+S">Shiyi Cao</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuxian Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiuyu Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yunhao Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yukang Chen</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+C">Cheng-Yu Hsieh</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">De-An Huang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+A">An-Chieh Cheng</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+V">Vishwesh
Nath</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinyi Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sifei Liu</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Daguang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Molchanov%2C+P">Pavlo Molchanov</a>, <a href="/search/cs?searchtype=author&query=Kautz%2C+J">Jan Kautz</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongxu Yin</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04468v1-abstract-short" style="display: inline;"> Visual language models (VLMs) have made significant advances in accuracy in recent years. However, their efficiency has received much less attention. This paper introduces NVILA, a family of open VLMs designed to optimize both efficiency and accuracy. Building on top of VILA, we improve its model architecture by first scaling up the spatial and temporal resolutions, and then compressing visual tok… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04468v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04468v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04468v1-abstract-full" style="display: none;"> Visual language models (VLMs) have made significant advances in accuracy in recent years. However, their efficiency has received much less attention. This paper introduces NVILA, a family of open VLMs designed to optimize both efficiency and accuracy. Building on top of VILA, we improve its model architecture by first scaling up the spatial and temporal resolutions, and then compressing visual tokens. This "scale-then-compress" approach enables NVILA to efficiently process high-resolution images and long videos. We also conduct a systematic investigation to enhance the efficiency of NVILA throughout its entire lifecycle, from training and fine-tuning to deployment. NVILA matches or surpasses the accuracy of many leading open and proprietary VLMs across a wide range of image and video benchmarks. At the same time, it reduces training costs by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by 1.6-2.2X, and decoding latency by 1.2-2.8X. We will soon make our code and models available to facilitate reproducibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04468v1-abstract-full').style.display = 'none'; document.getElementById('2412.04468v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
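</p> <p class="comments is-size-7"> The "scale-then-compress" idea above can be made concrete: raise the input resolution so the vision encoder emits a larger grid of visual tokens, then merge neighboring tokens before they reach the language model. The PyTorch sketch below only illustrates that pattern; the module name, the 2x2 window, and the linear projection are assumptions, not NVILA's released code. </p> <pre><code class="language-python">
# Illustrative sketch of "scale-then-compress" (invented names, not NVILA's code):
# process a high-resolution token grid, then merge each 2x2 neighborhood of
# visual tokens into one token so the LLM sees 4x fewer of them.
import torch
import torch.nn as nn

class SpatialTokenCompressor(nn.Module):
    """Merges each k x k patch of visual tokens into a single token."""
    def __init__(self, dim: int, window: int = 2):
        super().__init__()
        self.window = window
        self.proj = nn.Linear(dim * window * window, dim)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch, H, W, dim) grid of visual tokens
        b, h, w, d = tokens.shape
        k = self.window
        assert h % k == 0 and w % k == 0, "grid must be divisible by the window"
        # group k x k neighborhoods, concatenate their features, project back to dim
        tokens = tokens.reshape(b, h // k, k, w // k, k, d)
        tokens = tokens.permute(0, 1, 3, 2, 4, 5).reshape(b, h // k, w // k, k * k * d)
        return self.proj(tokens)  # (batch, H/k, W/k, dim)

# Example: a 32x32 token grid (from a scaled-up input) becomes 16x16
x = torch.randn(1, 32, 32, 768)
print(SpatialTokenCompressor(768)(x).shape)  # torch.Size([1, 16, 16, 768])
</code></pre> <p class="is-size-7">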
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03603">arXiv:2412.03603</a> <span> [<a href="https://arxiv.org/pdf/2412.03603">pdf</a>, <a href="https://arxiv.org/format/2412.03603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HunyuanVideo: A Systematic Framework For Large Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+W">Weijie Kong</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qi Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zijian Zhang</a>, <a href="/search/cs?searchtype=author&query=Min%2C+R">Rox Min</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zuozhuo Dai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jin Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jiangfeng Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bo Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kathrina Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qin Lin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Junkun Yuan</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yanxin Long</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Aladdin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Andong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changlin Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Duojun Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fang Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongmei Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jacob Song</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jiawang Bai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianbing Wu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jinbao Xue</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03603v4-abstract-short" style="display: inline;"> Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03603v4-abstract-full').style.display = 'inline'; document.getElementById('2412.03603v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03603v4-abstract-full" style="display: none;"> Recent advancements in video generation have significantly impacted daily life for both individuals and industries. 
However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at https://github.com/Tencent/HunyuanVideo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03603v4-abstract-full').style.display = 'none'; document.getElementById('2412.03603v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02205">arXiv:2412.02205</a> <span> [<a href="https://arxiv.org/pdf/2412.02205">pdf</a>, <a href="https://arxiv.org/format/2412.02205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DataLab: A Unified Platform for LLM-Powered Business Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weng%2C+L">Luoxuan Weng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yinghao Tang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yingchaojie Feng</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Z">Zhuo Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruiqin Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Haozhe Feng</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+C">Chen Hou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Danqing Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+H">Huaming Rao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haonan Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+C">Canshi Wei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofeng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yifeng Zheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiuqi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Minfeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yuxin Ma</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Bin Cui</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02205v2-abstract-short" style="display: inline;"> Business intelligence (BI) transforms large volumes of data within modern organizations into actionable insights for informed decision-making. Recently, large language model (LLM)-based agents have streamlined the BI workflow by automatically performing task planning, reasoning, and actions in executable environments based on natural language (NL) queries. However, existing approaches primarily fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02205v2-abstract-full').style.display = 'inline'; document.getElementById('2412.02205v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02205v2-abstract-full" style="display: none;"> Business intelligence (BI) transforms large volumes of data within modern organizations into actionable insights for informed decision-making. 
Recently, large language model (LLM)-based agents have streamlined the BI workflow by automatically performing task planning, reasoning, and actions in executable environments based on natural language (NL) queries. However, existing approaches primarily focus on individual BI tasks such as NL2SQL and NL2VIS. The fragmentation of tasks across different data roles and tools leads to inefficiencies and potential errors due to the iterative and collaborative nature of BI. In this paper, we introduce DataLab, a unified BI platform that integrates a one-stop LLM-based agent framework with an augmented computational notebook interface. DataLab supports a wide range of BI tasks for different data roles by seamlessly combining LLM assistance with user customization within a single environment. To achieve this unification, we design a domain knowledge incorporation module tailored for enterprise-specific BI tasks, an inter-agent communication mechanism to facilitate information sharing across the BI workflow, and a cell-based context management strategy to enhance context utilization efficiency in BI notebooks. Extensive experiments demonstrate that DataLab achieves state-of-the-art performance on various BI tasks across popular research benchmarks. Moreover, DataLab maintains high effectiveness and efficiency on real-world datasets from Tencent, achieving up to a 58.58% increase in accuracy and a 61.65% reduction in token cost on enterprise-specific BI tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02205v2-abstract-full').style.display = 'none'; document.getElementById('2412.02205v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
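</p> <p class="comments is-size-7"> The cell-based context management strategy mentioned above can be pictured as a ranking-and-packing problem: score notebook cells for relevance to the current query and keep only the best ones that fit the model's token budget. The minimal Python sketch below is a hypothetical stand-in; the Cell fields, the lexical scorer, and the recency bonus are invented, not DataLab's implementation. </p> <pre><code class="language-python">
# Illustrative sketch of cell-based context management (invented names):
# rank notebook cells by relevance to the query, pack them under a token budget.
from dataclasses import dataclass

@dataclass
class Cell:
    source: str     # cell text (code or markdown)
    tokens: int     # precomputed token count
    last_used: int  # execution counter; higher means more recent

def keyword_overlap(query: str, cell: Cell) -> float:
    # naive lexical relevance; any retriever could replace this
    q = set(query.lower().split())
    c = set(cell.source.lower().split())
    return len(q & c) / (len(q) or 1)

def select_context(cells: list[Cell], query: str, budget: int) -> list[Cell]:
    # score = relevance plus a small recency bonus
    ranked = sorted(cells, key=lambda c: keyword_overlap(query, c) + 0.01 * c.last_used,
                    reverse=True)
    picked, used = [], 0
    for cell in ranked:
        if used + cell.tokens <= budget:
            picked.append(cell)
            used += cell.tokens
    # restore notebook order (approximated here by the execution counter)
    return sorted(picked, key=lambda c: c.last_used)
</code></pre> <p class="is-size-7">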
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14571">arXiv:2411.14571</a> <span> [<a href="https://arxiv.org/pdf/2411.14571">pdf</a>, <a href="https://arxiv.org/format/2411.14571">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Assessment of LLM Responses to End-user Security Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Prakash%2C+V">Vijay Prakash</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kevin Lee</a>, <a href="/search/cs?searchtype=author&query=Bhattacharya%2C+A">Arkaprabha Bhattacharya</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D+Y">Danny Yuxing Huang</a>, <a href="/search/cs?searchtype=author&query=Staddon%2C+J">Jessica Staddon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14571v1-abstract-short" style="display: inline;"> Answering end user security questions is challenging. While large language models (LLMs) like GPT, LLAMA, and Gemini are far from error-free, they have shown promise in answering a variety of questions outside of security. We studied LLM performance in the area of end user security by qualitatively evaluating 3 popular LLMs on 900 systematically collected end user security questions. While LLMs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14571v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14571v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14571v1-abstract-full" style="display: none;"> Answering end user security questions is challenging. While large language models (LLMs) like GPT, LLAMA, and Gemini are far from error-free, they have shown promise in answering a variety of questions outside of security. We studied LLM performance in the area of end user security by qualitatively evaluating 3 popular LLMs on 900 systematically collected end user security questions. While LLMs demonstrate broad generalist ``knowledge'' of end user security information, there are patterns of errors and limitations across LLMs consisting of stale and inaccurate answers, and indirect or unresponsive communication styles, all of which impacts the quality of information received. Based on these patterns, we suggest directions for model improvement and recommend user strategies for interacting with LLMs when seeking assistance with security. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14571v1-abstract-full').style.display = 'none'; document.getElementById('2411.14571v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 1 figure, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13881">arXiv:2411.13881</a> <span> [<a href="https://arxiv.org/pdf/2411.13881">pdf</a>, <a href="https://arxiv.org/format/2411.13881">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> </div> </div> <p class="title is-5 mathjax"> Exploring applications of topological data analysis in stock index movement prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+D">Dazhi Huang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+P">Pengcheng Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaocheng Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiayi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13881v1-abstract-short" style="display: inline;"> Topological Data Analysis (TDA) has recently gained significant attention in the field of financial prediction. However, the choice of point cloud construction methods, topological feature representations, and classification models has a substantial impact on prediction results. This paper addresses the classification problem of stock index movement. First, we construct point clouds for stock indi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13881v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13881v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13881v1-abstract-full" style="display: none;"> Topological Data Analysis (TDA) has recently gained significant attention in the field of financial prediction. However, the choice of point cloud construction methods, topological feature representations, and classification models has a substantial impact on prediction results. This paper addresses the classification problem of stock index movement. First, we construct point clouds for stock indices using three different methods. Next, we apply TDA to extract topological structures from the point clouds. Four distinct topological features are computed to represent the patterns in the data, and 15 combinations of these features are enumerated and input into six different machine learning models. 
We evaluate the predictive performance of various TDA configurations by conducting index movement classification tasks on datasets such as CSI, DAX, HSI, and FTSE, providing insights into the efficiency of different TDA setups. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13881v1-abstract-full').style.display = 'none'; document.getElementById('2411.13881v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13291">arXiv:2411.13291</a> <span> [<a href="https://arxiv.org/pdf/2411.13291">pdf</a>, <a href="https://arxiv.org/format/2411.13291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DATAP-SfM: Dynamic-Aware Tracking Any Point for Robust Structure from Motion in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+W">Weicai Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+R">Ruohao Zhan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaoshui Huang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haoyi Zhu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13291v1-abstract-short" style="display: inline;"> This paper proposes a concise, elegant, and robust pipeline to estimate smooth camera trajectories and obtain dense point clouds for casual videos in the wild. Traditional frameworks, such as ParticleSfM [Zhao et al. 2022], address this problem by sequentially computing the optical flow between adjacent frames to obtain point trajectories. They then remove dynamic trajectories through moti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13291v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13291v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13291v1-abstract-full" style="display: none;"> This paper proposes a concise, elegant, and robust pipeline to estimate smooth camera trajectories and obtain dense point clouds for casual videos in the wild.
Traditional frameworks, such as ParticleSfM [Zhao et al. 2022], address this problem by sequentially computing the optical flow between adjacent frames to obtain point trajectories. They then remove dynamic trajectories through motion segmentation and perform global bundle adjustment. However, the process of estimating optical flow between two adjacent frames and chaining the matches can introduce cumulative errors. Additionally, motion segmentation combined with single-view depth estimation often faces challenges related to scale ambiguity. To tackle these challenges, we propose a dynamic-aware tracking any point (DATAP) method that leverages consistent video depth and point tracking. Specifically, our DATAP addresses these issues by estimating dense point tracking across the video sequence and predicting the visibility and dynamics of each point. By incorporating the consistent video depth prior, the performance of motion segmentation is enhanced. With the integration of DATAP, it becomes possible to estimate and optimize all camera poses simultaneously by performing global bundle adjustment over the point tracks classified as static and visible, rather than relying on incremental camera registration. Extensive experiments on dynamic sequences, e.g., Sintel and TUM RGBD dynamic sequences, and on in-the-wild videos, e.g., DAVIS, demonstrate that the proposed method achieves state-of-the-art camera pose estimation even in complex, challenging dynamic scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13291v1-abstract-full').style.display = 'none'; document.getElementById('2411.13291v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
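</p> <p class="comments is-size-7"> The final step described above, global bundle adjustment restricted to point tracks predicted as static and visible, amounts to masking the tracker's outputs before optimization. The NumPy sketch below illustrates that selection under assumed shapes and thresholds; none of the names or values come from the authors' code. </p> <pre><code class="language-python">
# Illustrative sketch (not the authors' code): after a DATAP-style tracker
# predicts per-point visibility and dynamics, keep only static, well-observed
# tracks as constraints for global bundle adjustment.
import numpy as np

def select_ba_tracks(tracks, visibility, dynamics, min_visible_frames=5):
    """
    tracks:     (N, T, 2) 2D point trajectories over T frames
    visibility: (N, T) bool, whether the point is visible in each frame
    dynamics:   (N,) float in [0, 1], predicted probability the point moves
    Returns the subset of tracks usable as static constraints in BA.
    """
    static = dynamics < 0.5                               # assumed threshold
    well_observed = visibility.sum(axis=1) >= min_visible_frames
    keep = static & well_observed
    return tracks[keep], visibility[keep]

# Toy usage with random data
N, T = 100, 24
tracks = np.random.rand(N, T, 2)
vis = np.random.rand(N, T) > 0.3
dyn = np.random.rand(N)
static_tracks, static_vis = select_ba_tracks(tracks, vis, dyn)
print(static_tracks.shape)  # (M, 24, 2) with M <= 100
</code></pre> <p class="is-size-7">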
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12791">arXiv:2411.12791</a> <span> [<a href="https://arxiv.org/pdf/2411.12791">pdf</a>, <a href="https://arxiv.org/format/2411.12791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Perception Bias: A Training-Free Approach to Enhance LMM for Image Quality Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+S">Siyi Pan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Baoliang Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Danni Huang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hanwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lingyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+X">Xiangjie Sui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiqi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12791v1-abstract-short" style="display: inline;"> Despite the impressive performance of large multimodal models (LMMs) in high-level visual tasks, their capacity for image quality assessment (IQA) remains limited. One main reason is that LMMs are primarily trained for high-level tasks (e.g., image captioning), emphasizing unified image semantics extraction under varied quality. Such semantic-aware yet quality-insensitive perception bias inevitabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12791v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12791v1-abstract-full" style="display: none;"> Despite the impressive performance of large multimodal models (LMMs) in high-level visual tasks, their capacity for image quality assessment (IQA) remains limited. One main reason is that LMMs are primarily trained for high-level tasks (e.g., image captioning), emphasizing unified image semantics extraction under varied quality. Such semantic-aware yet quality-insensitive perception bias inevitably leads to a heavy reliance on image semantics when those LMMs are forced for quality rating. In this paper, instead of retraining or tuning an LMM costly, we propose a training-free debiasing framework, in which the image quality prediction is rectified by mitigating the bias caused by image semantics. Specifically, we first explore several semantic-preserving distortions that can significantly degrade image quality while maintaining identifiable semantics. By applying these specific distortions to the query or test images, we ensure that the degraded images are recognized as poor quality while their semantics remain. 
During quality inference, both a query image and its corresponding degraded version are fed to the LMM along with a prompt indicating that the query image quality should be inferred under the condition that the degraded one is deemed poor quality. This prior condition effectively aligns the LMM's quality perception, as all degraded images are consistently rated as poor quality, regardless of their semantic difference. Finally, the quality scores of the query image inferred under different prior conditions (degraded versions) are aggregated using a conditional probability model. Extensive experiments on various IQA datasets show that our debiasing framework consistently enhances LMM performance; the code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12791v1-abstract-full').style.display = 'none'; document.getElementById('2411.12791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11091">arXiv:2411.11091</a> <span> [<a href="https://arxiv.org/pdf/2411.11091">pdf</a>, <a href="https://arxiv.org/format/2411.11091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> KV-Tandem -- a Modular Approach to Building High-Speed LSM Storage Engines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bortnikov%2C+E">Edward Bortnikov</a>, <a href="/search/cs?searchtype=author&query=Azran%2C+M">Michael Azran</a>, <a href="/search/cs?searchtype=author&query=Bornstein%2C+A">Asa Bornstein</a>, <a href="/search/cs?searchtype=author&query=Dashevsky%2C+S">Shmuel Dashevsky</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Dennis Huang</a>, <a href="/search/cs?searchtype=author&query=Kepten%2C+O">Omer Kepten</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+M">Michael Pan</a>, <a href="/search/cs?searchtype=author&query=Sheffi%2C+G">Gali Sheffi</a>, <a href="/search/cs?searchtype=author&query=Twitto%2C+M">Moshe Twitto</a>, <a href="/search/cs?searchtype=author&query=Orzech%2C+T+W">Tamar Weiss Orzech</a>, <a href="/search/cs?searchtype=author&query=Keidar%2C+I">Idit Keidar</a>, <a href="/search/cs?searchtype=author&query=Gueta%2C+G">Guy Gueta</a>, <a href="/search/cs?searchtype=author&query=Maor%2C+R">Roey Maor</a>, <a href="/search/cs?searchtype=author&query=Dayan%2C+N">Niv Dayan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11091v1-abstract-short" style="display: inline;"> We present KV-Tandem, a modular architecture for building LSM-based storage engines on top of simple, non-ordered persistent key-value stores (KVSs). KV-Tandem enables advanced functionalities such as range queries and snapshot reads, while maintaining the native KVS performance for random reads and writes.
Its modular design offers better performance trade-offs compared to previous KV-sepa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11091v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11091v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11091v1-abstract-full" style="display: none;"> We present KV-Tandem, a modular architecture for building LSM-based storage engines on top of simple, non-ordered persistent key-value stores (KVSs). KV-Tandem enables advanced functionalities such as range queries and snapshot reads, while maintaining the native KVS performance for random reads and writes. Its modular design offers better performance trade-offs compared to previous KV-separation solutions, which struggle to decompose the monolithic LSM structure. Central to KV-Tandem is LSM bypass, a novel algorithm that offers a fast path to basic operations while ensuring the correctness of advanced APIs. We implement KV-Tandem in XDP-Rocks, a RocksDB-compatible storage engine that leverages the XDP KVS and incorporates practical design optimizations for real-world deployment. Through extensive microbenchmark and system-level comparisons, we demonstrate that XDP-Rocks achieves 3x to 4x performance improvements over RocksDB across various workloads. XDP-Rocks is already deployed in production, delivering significant operator cost savings consistent with these performance gains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11091v1-abstract-full').style.display = 'none'; document.getElementById('2411.11091v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
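</p> <p class="comments is-size-7"> The LSM-bypass idea, as described above, sends point reads and writes straight to the unordered KVS and consults an ordered key index only for range scans. The toy Python sketch below illustrates that split; the class name and the in-memory sorted list are placeholders, not the XDP-Rocks implementation. </p> <pre><code class="language-python">
# Illustrative sketch (invented names, not XDP-Rocks): point get/put hit the
# unordered KVS directly, while a small ordered index of keys (here a sorted
# list) is maintained only to answer range scans.
import bisect

class TandemStore:
    def __init__(self):
        self.kvs = {}      # stand-in for a fast, non-ordered persistent KVS
        self.ordered = []  # ordered key index, used only by range queries

    def put(self, key: bytes, value: bytes) -> None:
        if key not in self.kvs:                 # index update for new keys only
            bisect.insort(self.ordered, key)
        self.kvs[key] = value                   # fast path: native KVS write

    def get(self, key: bytes):
        return self.kvs.get(key)                # fast path: native KVS read

    def scan(self, lo: bytes, hi: bytes):
        # range query served by the ordered index; values come from the KVS
        i = bisect.bisect_left(self.ordered, lo)
        j = bisect.bisect_right(self.ordered, hi)
        for key in self.ordered[i:j]:
            yield key, self.kvs[key]

s = TandemStore()
s.put(b"a", b"1"); s.put(b"c", b"3"); s.put(b"b", b"2")
print(list(s.scan(b"a", b"b")))  # [(b'a', b'1'), (b'b', b'2')]
</code></pre> <p class="is-size-7">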
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09349">arXiv:2411.09349</a> <span> [<a href="https://arxiv.org/pdf/2411.09349">pdf</a>, <a href="https://arxiv.org/format/2411.09349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ParaLBench: A Large-Scale Benchmark for Computational Paralinguistics over Acoustic Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weixiang Xu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhongren Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kanglin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yimeng Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jing Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Runming Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Dong-Yan Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09349v1-abstract-short" style="display: inline;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09349v1-abstract-full" style="display: none;"> Computational paralinguistics (ComParal) aims to develop algorithms and models to automatically detect, analyze, and interpret non-verbal information from speech communication, e. g., emotion, health state, age, and gender. Despite its rapid progress, it heavily depends on sophisticatedly designed models given specific paralinguistic tasks. Thus, the heterogeneity and diversity of ComParal models largely prevent the realistic implementation of ComParal models. Recently, with the advent of acoustic foundation models because of self-supervised learning, developing more generic models that can efficiently perceive a plethora of paralinguistic information has become an active topic in speech processing. However, it lacks a unified evaluation framework for a fair and consistent performance comparison. To bridge this gap, we conduct a large-scale benchmark, namely ParaLBench, which concentrates on standardizing the evaluation process of diverse paralinguistic tasks, including critical aspects of affective computing such as emotion recognition and emotion dimensions prediction, over different acoustic foundation models. 
This benchmark contains ten datasets with thirteen distinct paralinguistic tasks, covering short-, medium- and long-term characteristics. Each task is carried out on 14 acoustic foundation models under a unified evaluation framework, which allows for an unbiased methodological comparison and offers a grounded reference for the ComParal community. Based on the insights gained from ParaLBench, we also point out potential research directions, e.g., cross-corpus generalizability, to propel ComParal research in the future. The code associated with this study will be made available to foster the transparency and replicability of this work for succeeding researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09349v1-abstract-full').style.display = 'none'; document.getElementById('2411.09349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07747">arXiv:2411.07747</a> <span> [<a href="https://arxiv.org/pdf/2411.07747">pdf</a>, <a href="https://arxiv.org/format/2411.07747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Constraint Learning for Parametric Point Cloud </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xi Cheng</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+R">Ruiqi Lei</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zhichao Liao</a>, <a href="/search/cs?searchtype=author&query=Piao%2C+F">Fengyuan Piao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yan Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pingfa Feng</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+L">Long Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07747v3-abstract-short" style="display: inline;"> Parametric point clouds are sampled from CAD shapes, and have become increasingly prevalent in industrial manufacturing. However, most existing point cloud learning methods focus on geometric features, such as efficient convolution operations, and overlook the important attribute of constraints inherent in CAD shapes, which limits these methods' ability to comprehend CAD shapes fully… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07747v3-abstract-full').style.display = 'inline'; document.getElementById('2411.07747v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07747v3-abstract-full" style="display: none;"> Parametric point clouds are sampled from CAD shapes, and have become increasingly prevalent in industrial manufacturing.
However, most existing point cloud learning methods focus on geometric features, such as efficient convolution operations, and overlook the important attribute of constraints inherent in CAD shapes, which limits these methods' ability to comprehend CAD shapes fully. To address this issue, we analyze the effect of constraints and propose a deep-learning-friendly representation of them; the Constraint Feature Learning Network (CstNet) is then developed to extract and leverage constraints. Our CstNet includes two stages. Stage 1 extracts constraints from B-Rep data or point clouds. Stage 2 leverages coordinates and constraints to enhance the comprehension of CAD shapes. Additionally, to address the scarcity of labeled B-Rep datasets, we built the Parametric 20,000 multi-modal dataset. Experiments demonstrate that our CstNet achieves state-of-the-art performance on both public and proposed CAD shape datasets. To the best of our knowledge, CstNet is the first constraint-based learning method tailored for CAD shape analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07747v3-abstract-full').style.display = 'none'; document.getElementById('2411.07747v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06102">arXiv:2411.06102</a> <span> [<a href="https://arxiv.org/pdf/2411.06102">pdf</a>, <a href="https://arxiv.org/format/2411.06102">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> SiriusBI: Building End-to-End Business Intelligence Enhanced by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Haining Xie</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yu Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+M">Meng Lei</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yifeng Zheng</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yide Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chunyou Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Danqing Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaofeng Yang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Bin Cui</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Peng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06102v1-abstract-short" style="display: inline;"> The rapid advancement of AI technologies, particularly Large Language Models (LLMs), is establishing a new
paradigm for Business Intelligence (BI). Despite the emergence of pioneering work in enhancing BI systems with LLMs, we have identified the following three issues when these systems are deployed in real industrial scenarios: interaction limitations, performance bottlenecks, and functionality deficiencies. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06102v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06102v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06102v1-abstract-full" style="display: none;"> The rapid advancement of AI technologies, particularly Large Language Models (LLMs), is establishing a new paradigm for Business Intelligence (BI). Despite the emergence of pioneering work in enhancing BI systems with LLMs, we have identified the following three issues when these systems are deployed in real industrial scenarios: interaction limitations, performance bottlenecks, and functionality deficiencies. In this paper, we present SiriusBI, an end-to-end business intelligence system that is designed to address the three issues simultaneously. First, we propose an intelligent and application-oriented module called multi-round dialogue with querying, which aims to overcome the prevalent interaction limitations in current BI solutions. Next, to mitigate the performance bottlenecks caused by scenario migration, we introduce two SQL generation methods that strike a balance between accuracy and deployment costs. Finally, to tackle the practical challenges posed by functionality deficiencies, we develop an end-to-end workflow that covers the entire BI process, ensuring that SiriusBI delivers a robust and complete set of functionalities. As an independent cloud service in Tencent's data platform, SiriusBI has been applied across Tencent's finance, advertising, and cloud sectors, providing services to dozens of enterprise clients. Experiments on real-world datasets and practical applications in industrial BI scenarios demonstrate the practicality and effectiveness of SiriusBI. Notably, SiriusBI achieves accuracy rates of 97% in SQL generation for Tencent Finance, 89% for Tencent Advertisement, and 91% for Tencent Cloud. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06102v1-abstract-full').style.display = 'none'; document.getElementById('2411.06102v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
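</p> <p class="comments is-size-7"> The abstract mentions two SQL generation methods that balance accuracy against deployment cost. One generic way to realize such a balance is to route easy questions to a cheap one-shot prompt and harder ones to a costlier decompose-then-generate pipeline. The Python sketch below is a hypothetical illustration of that routing pattern, not SiriusBI's code; the llm argument stands for any text-completion callable. </p> <pre><code class="language-python">
# Illustrative routing sketch (invented, not SiriusBI's implementation):
# cheap one-shot NL2SQL for easy questions, a two-step pipeline for hard ones.
def one_shot_sql(question: str, schema: str, llm) -> str:
    return llm(f"Schema:\n{schema}\n\nWrite one SQL query answering: {question}")

def multi_step_sql(question: str, schema: str, llm) -> str:
    plan = llm(f"Schema:\n{schema}\n\nList the tables, joins and filters needed for: {question}")
    return llm(f"Schema:\n{schema}\n\nPlan:\n{plan}\n\nWrite the SQL query.")

def generate_sql(question: str, schema: str, llm, hard_threshold: int = 2) -> str:
    # crude difficulty proxy: questions touching many schema tokens take the
    # expensive path; a learned router could replace this heuristic
    mentioned = sum(tok in question.lower() for tok in schema.lower().split())
    route = multi_step_sql if mentioned >= hard_threshold else one_shot_sql
    return route(question, schema, llm)
</code></pre> <p class="is-size-7">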
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 5figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02861">arXiv:2411.02861</a> <span> [<a href="https://arxiv.org/pdf/2411.02861">pdf</a>, <a href="https://arxiv.org/format/2411.02861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Centerness-based Instance-aware Knowledge Distillation with Task-wise Mutual Lifting for Object Detection on Drone Imagery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+B">Bowei Du</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zhixuan Liao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanan Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zhi Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaxin Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02861v1-abstract-short" style="display: inline;"> Developing accurate and efficient detectors for drone imagery is challenging due to the inherent complexity of aerial scenes. While some existing methods aim to achieve high accuracy by utilizing larger models, their computational cost is prohibitive for drones. Recently, Knowledge Distillation (KD) has shown promising potential for maintaining satisfactory accuracy while significantly compressing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02861v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02861v1-abstract-full" style="display: none;"> Developing accurate and efficient detectors for drone imagery is challenging due to the inherent complexity of aerial scenes. While some existing methods aim to achieve high accuracy by utilizing larger models, their computational cost is prohibitive for drones. Recently, Knowledge Distillation (KD) has shown promising potential for maintaining satisfactory accuracy while significantly compressing models in general object detection. Considering the advantages of KD, this paper presents the first attempt to adapt it to object detection on drone imagery and addresses two intrinsic issues: (1) low foreground-background ratio and (2) small instances and complex backgrounds, which lead to inadequate training, resulting insufficient distillation. Therefore, we propose a task-wise Lightweight Mutual Lifting (Light-ML) module with a Centerness-based Instance-aware Distillation (CID) strategy. The Light-ML module mutually harmonizes the classification and localization branches by channel shuffling and convolution, integrating teacher supervision across different tasks during back-propagation, thus facilitating training the student model. The CID strategy extracts valuable regions surrounding instances through the centerness of proposals, enhancing distillation efficacy. 
Experiments on the VisDrone, UAVDT, and COCO benchmarks demonstrate that the proposed approach improves the accuracy of existing state-of-the-art KD methods at comparable computational cost. Code will be available upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02861v1-abstract-full').style.display = 'none'; document.getElementById('2411.02861v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02064">arXiv:2411.02064</a> <span> [<a href="https://arxiv.org/pdf/2411.02064">pdf</a>, <a href="https://arxiv.org/format/2411.02064">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Amortized Bayesian Experimental Design for Decision-Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+D">Daolang Huang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yujia Guo</a>, <a href="/search/cs?searchtype=author&query=Acerbi%2C+L">Luigi Acerbi</a>, <a href="/search/cs?searchtype=author&query=Kaski%2C+S">Samuel Kaski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02064v2-abstract-short" style="display: inline;"> Many critical decisions, such as personalized medical diagnoses and product pricing, are made based on insights gained from designing, observing, and analyzing a series of experiments. This highlights the crucial role of experimental design, which goes beyond merely collecting information on system parameters as in traditional Bayesian experimental design (BED), but also plays a key part in facili… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02064v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02064v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02064v2-abstract-full" style="display: none;"> Many critical decisions, such as personalized medical diagnoses and product pricing, are made based on insights gained from designing, observing, and analyzing a series of experiments. This highlights the crucial role of experimental design, which goes beyond merely collecting information on system parameters as in traditional Bayesian experimental design (BED), but also plays a key part in facilitating downstream decision-making. Most recent BED methods use an amortized policy network to rapidly design experiments. However, the information gathered through these methods is suboptimal for down-the-line decision-making, as the experiments are not inherently designed with downstream objectives in mind. In this paper, we present an amortized decision-aware BED framework that prioritizes maximizing downstream decision utility.
We introduce a novel architecture, the Transformer Neural Decision Process (TNDP), capable of instantly proposing the next experimental design, whilst inferring the downstream decision, thus effectively amortizing both tasks within a unified workflow. We demonstrate the performance of our method across several tasks, showing that it can deliver informative designs and facilitate accurate decision-making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02064v2-abstract-full').style.display = 'none'; document.getElementById('2411.02064v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 6 figures. Accepted at the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23262">arXiv:2410.23262</a> <span> [<a href="https://arxiv.org/pdf/2410.23262">pdf</a>, <a href="https://arxiv.org/format/2410.23262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> EMMA: End-to-End Multimodal Model for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jyh-Jing Hwang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runsheng Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hubert Lin</a>, <a href="/search/cs?searchtype=author&query=Hung%2C+W">Wei-Chih Hung</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jingwei Ji</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+K">Kristy Choi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Covington%2C+P">Paul Covington</a>, <a href="/search/cs?searchtype=author&query=Sapp%2C+B">Benjamin Sapp</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yin Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">James Guo</a>, <a href="/search/cs?searchtype=author&query=Anguelov%2C+D">Dragomir Anguelov</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+M">Mingxing Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23262v2-abstract-short" style="display: inline;"> We introduce EMMA, an End-to-end Multimodal 
Model for Autonomous driving. Built on a multi-modal large language model foundation, EMMA directly maps raw camera sensor data into various driving-specific outputs, including planner trajectories, perception objects, and road graph elements. EMMA maximizes the utility of world knowledge from the pre-trained large language models by representing all no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23262v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23262v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23262v2-abstract-full" style="display: none;"> We introduce EMMA, an End-to-end Multimodal Model for Autonomous driving. Built on a multi-modal large language model foundation, EMMA directly maps raw camera sensor data into various driving-specific outputs, including planner trajectories, perception objects, and road graph elements. EMMA maximizes the utility of world knowledge from the pre-trained large language models by representing all non-sensor inputs (e.g. navigation instructions and ego vehicle status) and outputs (e.g. trajectories and 3D locations) as natural language text. This approach allows EMMA to jointly process various driving tasks in a unified language space, and generate the outputs for each task using task-specific prompts. Empirically, we demonstrate EMMA's effectiveness by achieving state-of-the-art performance in motion planning on nuScenes as well as competitive results on the Waymo Open Motion Dataset (WOMD). EMMA also yields competitive results for camera-primary 3D object detection on the Waymo Open Dataset (WOD). We show that co-training EMMA with planner trajectories, object detection, and road graph tasks yields improvements across all three domains, highlighting EMMA's potential as a generalist model for autonomous driving applications. However, EMMA also exhibits certain limitations: it can process only a small number of image frames, does not incorporate accurate 3D sensing modalities like LiDAR or radar, and is computationally expensive. We hope that our results will inspire further research to mitigate these issues and to further evolve the state of the art in autonomous driving model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23262v2-abstract-full').style.display = 'none'; document.getElementById('2410.23262v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
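<p class="is-size-7">The abstract above hinges on representing all non-sensor inputs and outputs as natural language text. A hedged sketch of what that could look like follows; EMMA's code is not released, so the prompt template, field names, and waypoint format are assumptions, and camera tokens are omitted entirely.</p> <pre><code class="language-python">
# Illustrative guess at text-rendered planner I/O, not EMMA's actual format.
import re

def build_planner_prompt(instruction: str, ego_speed_mps: float, past_xy: list) -> str:
    """Render non-sensor inputs (navigation, ego status, history) as plain text."""
    past = "; ".join(f"({x:.1f}, {y:.1f})" for x, y in past_xy)
    return (
        f"Navigation instruction: {instruction}\n"
        f"Ego vehicle speed: {ego_speed_mps:.1f} m/s\n"
        f"Past trajectory (x, y): {past}\n"
        "Task: predict the next 5 future waypoints as (x, y) pairs."
    )

def parse_waypoints(text: str) -> list:
    """Decode a trajectory emitted as natural-language text back into floats."""
    pairs = re.findall(r"\((-?\d+(?:\.\d+)?),\s*(-?\d+(?:\.\d+)?)\)", text)
    return [(float(x), float(y)) for x, y in pairs]
</code></pre>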
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Blog post: https://waymo.com/blog/2024/10/introducing-emma/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21747">arXiv:2410.21747</a> <span> [<a href="https://arxiv.org/pdf/2410.21747">pdf</a>, <a href="https://arxiv.org/format/2410.21747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MotionGPT-2: A General-Purpose Motion-Language Model for Motion Generation and Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuan Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yaqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+J">Jile Jiao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xuetao Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21747v1-abstract-short" style="display: inline;"> Generating lifelike human motions from descriptive texts has experienced remarkable research focus in the recent years, propelled by the emerging requirements of digital humans.Despite impressive advances, existing approaches are often constrained by limited control modalities, task specificity, and focus solely on body motion representations.In this paper, we present MotionGPT-2, a unified Large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21747v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21747v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21747v1-abstract-full" style="display: none;"> Generating lifelike human motions from descriptive texts has experienced remarkable research focus in the recent years, propelled by the emerging requirements of digital humans.Despite impressive advances, existing approaches are often constrained by limited control modalities, task specificity, and focus solely on body motion representations.In this paper, we present MotionGPT-2, a unified Large Motion-Language Model (LMLM) that addresses these limitations. MotionGPT-2 accommodates multiple motion-relevant tasks and supporting multimodal control conditions through pre-trained Large Language Models (LLMs). It quantizes multimodal inputs-such as text and single-frame poses-into discrete, LLM-interpretable tokens, seamlessly integrating them into the LLM's vocabulary. These tokens are then organized into unified prompts, guiding the LLM to generate motion outputs through a pretraining-then-finetuning paradigm. 
We also show that the proposed MotionGPT-2 is highly adaptable to the challenging 3D holistic motion generation task, enabled by the innovative motion discretization framework, Part-Aware VQVAE, which ensures fine-grained representations of body and hand movements. Extensive experiments and visualizations validate the effectiveness of our method, demonstrating the adaptability of MotionGPT-2 across motion generation, motion captioning, and generalized motion completion tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21747v1-abstract-full').style.display = 'none'; document.getElementById('2410.21747v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21419">arXiv:2410.21419</a> <span> [<a href="https://arxiv.org/pdf/2410.21419">pdf</a>, <a href="https://arxiv.org/format/2410.21419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> High-Dimensional Gaussian Process Regression with Soft Kernel Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cama%C3%B1o%2C+C">Chris Camaño</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Daniel Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21419v1-abstract-short" style="display: inline;"> We introduce Soft Kernel Interpolation (SoftKI), designed for scalable Gaussian Process (GP) regression on high-dimensional datasets. Inspired by Structured Kernel Interpolation (SKI), which approximates a GP kernel via interpolation from a structured lattice, SoftKI approximates a kernel via softmax interpolation from a smaller number of learned interpolation (i.e., inducing) points. By abandoning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21419v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21419v1-abstract-full" style="display: none;"> We introduce Soft Kernel Interpolation (SoftKI), designed for scalable Gaussian Process (GP) regression on high-dimensional datasets. Inspired by Structured Kernel Interpolation (SKI), which approximates a GP kernel via interpolation from a structured lattice, SoftKI approximates a kernel via softmax interpolation from a smaller number of learned interpolation (i.e., inducing) points. By abandoning the lattice structure used in SKI-based methods, SoftKI separates the cost of forming an approximate GP kernel from the dimensionality of the data, making it well-suited for high-dimensional datasets.
We demonstrate the effectiveness of SoftKI across various examples, and show that its accuracy exceeds that of other scalable GP methods when the data dimensionality is modest (around $10$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21419v1-abstract-full').style.display = 'none'; document.getElementById('2410.21419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18962">arXiv:2410.18962</a> <span> [<a href="https://arxiv.org/pdf/2410.18962">pdf</a>, <a href="https://arxiv.org/format/2410.18962">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Where Am I and What Will I See: An Auto-Regressive Model for Spatial Localization and View Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junyi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Weicai Ye</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18962v1-abstract-short" style="display: inline;"> Spatial intelligence is the ability of a machine to perceive, reason, and act in three dimensions within space and time. Recent advancements in large-scale auto-regressive models have demonstrated remarkable capabilities across various reasoning tasks. However, these models often struggle with fundamental aspects of spatial reasoning, particularly in answering questions like "Where am I?" and "Wha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18962v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18962v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18962v1-abstract-full" style="display: none;"> Spatial intelligence is the ability of a machine to perceive, reason, and act in three dimensions within space and time. Recent advancements in large-scale auto-regressive models have demonstrated remarkable capabilities across various reasoning tasks. However, these models often struggle with fundamental aspects of spatial reasoning, particularly in answering questions like "Where am I?" and "What will I see?". While some attempts have been made, existing approaches typically treat them as separate tasks, failing to capture their interconnected nature. In this paper, we present Generative Spatial Transformer (GST), a novel auto-regressive framework that jointly addresses spatial localization and view prediction.
Our model simultaneously estimates the camera pose from a single image and predicts the view from a new camera pose, effectively bridging the gap between spatial awareness and visual prediction. The proposed camera tokenization method enables the model to learn the joint distribution of 2D projections and their corresponding spatial perspectives in an auto-regressive manner. This unified training paradigm demonstrates that joint optimization of pose estimation and novel view synthesis leads to improved performance in both tasks, highlighting for the first time the inherent relationship between spatial awareness and visual prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18962v1-abstract-full').style.display = 'none'; document.getElementById('2410.18962v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15320">arXiv:2410.15320</a> <span> [<a href="https://arxiv.org/pdf/2410.15320">pdf</a>, <a href="https://arxiv.org/format/2410.15320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Amortized Probabilistic Conditioning for Optimization, Simulation and Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+P+E">Paul E. Chang</a>, <a href="/search/cs?searchtype=author&query=Loka%2C+N">Nasrulloh Loka</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Daolang Huang</a>, <a href="/search/cs?searchtype=author&query=Remes%2C+U">Ulpu Remes</a>, <a href="/search/cs?searchtype=author&query=Kaski%2C+S">Samuel Kaski</a>, <a href="/search/cs?searchtype=author&query=Acerbi%2C+L">Luigi Acerbi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15320v1-abstract-short" style="display: inline;"> Amortized meta-learning methods based on pre-training have propelled fields like natural language processing and vision. Transformer-based neural processes and their variants are leading models for probabilistic meta-learning with a tractable objective. Often trained on synthetic data, these models implicitly capture essential latent information in the data-generation process. However, existing me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15320v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15320v1-abstract-full" style="display: none;"> Amortized meta-learning methods based on pre-training have propelled fields like natural language processing and vision. Transformer-based neural processes and their variants are leading models for probabilistic meta-learning with a tractable objective.
Often trained on synthetic data, these models implicitly capture essential latent information in the data-generation process. However, existing methods do not allow users to flexibly inject (condition on) and extract (predict) this probabilistic latent information at runtime, which is key to many tasks. We introduce the Amortized Conditioning Engine (ACE), a new transformer-based meta-learning model that explicitly represents latent variables of interest. ACE affords conditioning on both observed data and interpretable latent variables and the inclusion of priors at runtime, and it outputs predictive distributions for discrete and continuous data and latents. We show ACE's modeling flexibility and performance in diverse tasks such as image completion and classification, Bayesian optimization, and simulation-based inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15320v1-abstract-full').style.display = 'none'; document.getElementById('2410.15320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 21 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13925">arXiv:2410.13925</a> <span> [<a href="https://arxiv.org/pdf/2410.13925">pdf</a>, <a href="https://arxiv.org/format/2410.13925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FiTv2: Scalable and Improved Flexible Vision Transformer for Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">ZiDong Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zeyu Lu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Cai Zhou</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+a+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13925v1-abstract-short" style="display: inline;"> \textit{Nature is infinitely resolution-free}. In the context of this reality, existing diffusion models, such as Diffusion Transformers, often face challenges when processing image resolutions outside of their trained domain. To address this limitation, we conceptualize images as sequences of tokens with dynamic sizes, rather than as fixed-resolution grids, as in traditional methods… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13925v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13925v1-abstract-full" style="display: none;"> \textit{Nature is infinitely resolution-free}.
In the context of this reality, existing diffusion models, such as Diffusion Transformers, often face challenges when processing image resolutions outside of their trained domain. To address this limitation, we conceptualize images as sequences of tokens with dynamic sizes, rather than as fixed-resolution grids, as in traditional methods. This perspective enables a flexible training strategy that seamlessly accommodates various aspect ratios during both training and inference, thus promoting resolution generalization and eliminating biases introduced by image cropping. On this basis, we present the \textbf{Flexible Vision Transformer} (FiT), a transformer architecture specifically designed for generating images with \textit{unrestricted resolutions and aspect ratios}. We further upgrade the FiT to FiTv2 with several innovative designs, including the Query-Key vector normalization, the AdaLN-LoRA module, a rectified flow scheduler, and a Logit-Normal sampler. Enhanced by a meticulously adjusted network structure, FiTv2 exhibits $2\times$ the convergence speed of FiT. When incorporating advanced training-free extrapolation techniques, FiTv2 demonstrates remarkable adaptability in both resolution extrapolation and diverse resolution generation. Additionally, our exploration of the scalability of the FiTv2 model reveals that larger models exhibit better computational efficiency. Furthermore, we introduce an efficient post-training strategy to adapt a pre-trained model for high-resolution generation. Comprehensive experiments demonstrate the exceptional performance of FiTv2 across a broad range of resolutions. We have released all the code and models at \url{https://github.com/whlzy/FiT} to promote the exploration of diffusion transformer models for arbitrary-resolution image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13925v1-abstract-full').style.display = 'none'; document.getElementById('2410.13925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
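<p class="is-size-7">Among the FiTv2 designs listed above, Query-Key vector normalization is simple enough to illustrate directly. The sketch below shows the generic technique, where queries and keys are L2-normalized so attention logits become scaled cosine similarities; FiTv2's exact variant may differ in details such as where and how the scale is learned, so treat this as an approximation rather than the paper's code.</p> <pre><code class="language-python">
# Generic query-key (QK) normalization in attention; a common stabilization
# trick in vision transformers, shown here as a minimal PyTorch sketch.
import torch
import torch.nn.functional as F

def qk_norm_attention(q, k, v, scale: float = 10.0):
    # q, k, v: (batch, heads, tokens, dim)
    q = F.normalize(q, dim=-1)                 # unit-norm queries
    k = F.normalize(k, dim=-1)                 # unit-norm keys
    attn = (q @ k.transpose(-2, -1)) * scale   # bounded cosine logits aid stability
    return attn.softmax(dim=-1) @ v
</code></pre>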
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2402.12376</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10815">arXiv:2410.10815</a> <span> [<a href="https://arxiv.org/pdf/2410.10815">pdf</a>, <a href="https://arxiv.org/format/2410.10815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Depth Any Video with Scalable Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Honghui Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Di Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haifeng Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Binbin Lin</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10815v1-abstract-short" style="display: inline;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yieldin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10815v1-abstract-full" style="display: none;"> Video depth estimation has long been hindered by the scarcity of consistent and scalable ground truth data, leading to inconsistent and unreliable results. In this paper, we introduce Depth Any Video, a model that tackles the challenge through two key innovations. First, we develop a scalable synthetic data pipeline, capturing real-time video depth data from diverse synthetic environments, yielding 40,000 video clips of 5-second duration, each with precise depth annotations. Second, we leverage the powerful priors of generative video diffusion models to handle real-world videos effectively, integrating advanced techniques such as rotary position encoding and flow matching to further enhance flexibility and efficiency. Unlike previous models, which are limited to fixed-length video sequences, our approach introduces a novel mixed-duration training strategy that handles videos of varying lengths and performs robustly across different frame rates-even on single frames. 
At inference, we propose a depth interpolation method that enables our model to infer high-resolution video depth across sequences of up to 150 frames. Our model outperforms all previous generative depth models in terms of spatial accuracy and temporal consistency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10815v1-abstract-full').style.display = 'none'; document.getElementById('2410.10815v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://depthanyvideo.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10209">arXiv:2410.10209</a> <span> [<a href="https://arxiv.org/pdf/2410.10209">pdf</a>, <a href="https://arxiv.org/format/2410.10209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Effi-Code: Unleashing Code Efficiency in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+D">Dong Huang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+G">Guangtao Zeng</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jianbo Dai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+M">Meng Luo</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+H">Han Weng</a>, <a href="/search/cs?searchtype=author&query=Qing%2C+Y">Yuhao Qing</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+H">Heming Cui</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhijiang Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J+M">Jie M. Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10209v2-abstract-short" style="display: inline;"> As the use of large language models (LLMs) for code generation becomes more prevalent in software development, it is critical to enhance both the efficiency and correctness of the generated code. Existing methods and models primarily focus on the correctness of LLM-generated code, ignoring efficiency. In this work, we present Effi-Code, an approach to enhancing code generation in LLMs that can imp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10209v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10209v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10209v2-abstract-full" style="display: none;"> As the use of large language models (LLMs) for code generation becomes more prevalent in software development, it is critical to enhance both the efficiency and correctness of the generated code. 
Existing methods and models primarily focus on the correctness of LLM-generated code, ignoring efficiency. In this work, we present Effi-Code, an approach to enhancing code generation in LLMs that can improve both efficiency and correctness. We introduce a Self-Optimization process based on Overhead Profiling that leverages open-source LLMs to generate a high-quality dataset of correct and efficient code samples. This dataset is then used to fine-tune various LLMs. Our method involves the iterative refinement of generated code, guided by runtime performance metrics and correctness checks. Extensive experiments demonstrate that models fine-tuned on the Effi-Code dataset show significant improvements in both code correctness and efficiency across task types. For example, the pass@1 of code generated by DeepSeek-Coder-6.7B-Instruct increases from \textbf{43.3\%} to \textbf{76.8\%}, and the average execution time for the same correct tasks decreases by \textbf{30.5\%}. Effi-Code offers a scalable and generalizable approach to improving code generation in AI systems, with potential applications in software development, algorithm design, and computational problem-solving. The source code of Effi-Code is released at \url{https://github.com/huangd1999/Effi-Code}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10209v2-abstract-full').style.display = 'none'; document.getElementById('2410.10209v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
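<p class="is-size-7">The abstract above describes an iterative refinement loop guided by runtime performance metrics and correctness checks. The following is a minimal sketch of such a generate-test-profile-refine loop under assumed interfaces (an llm callable and tests that probe the executed namespace); the actual pipeline lives in the Effi-Code repository linked above, and may differ substantially.</p> <pre><code class="language-python">
# Hedged sketch of a generate -> test -> profile -> refine loop, not Effi-Code itself.
import time

def refine_for_efficiency(llm, task: str, tests, rounds: int = 3) -> str:
    """Generate code, then iteratively rewrite it guided by correctness and runtime."""
    code = llm(f"Write a Python function for this task: {task}")
    best_code, best_time = None, float("inf")
    for _ in range(rounds):
        namespace = {}
        try:
            exec(code, namespace)                     # load the candidate code
            start = time.perf_counter()
            passed = all(test(namespace) for test in tests)  # correctness checks
            elapsed = time.perf_counter() - start     # crude runtime profile
        except Exception:
            passed, elapsed = False, float("inf")
        if passed and elapsed < best_time:
            best_code, best_time = code, elapsed      # keep the fastest correct version
        code = llm(
            f"Task: {task}\nCurrent code:\n{code}\n"
            f"Tests passed: {passed}; measured runtime: {elapsed:.4f}s.\n"
            "Rewrite the code to run faster while keeping it correct."
        )
    return best_code or code
</code></pre>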
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08490">arXiv:2410.08490</a> <span> [<a href="https://arxiv.org/pdf/2410.08490">pdf</a>, <a href="https://arxiv.org/format/2410.08490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CAS-GAN for Contrast-free Angiography Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+D">De-Xing Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiao-Hu Zhou</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+M">Mei-Jiang Gui</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiao-Liang Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shi-Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuang-Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+T">Tian-Yu Xiang</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Z">Zeng-Guang Hou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08490v3-abstract-short" style="display: inline;"> Iodinated contrast agents are widely utilized in numerous interventional procedures, yet posing substantial health risks to patients. This paper presents CAS-GAN, a novel GAN framework that serves as a "virtual contrast agent" to synthesize X-ray angiographies via disentanglement representation learning and vessel semantic guidance, thereby reducing the reliance on iodinated contrast agents during… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08490v3-abstract-full').style.display = 'inline'; document.getElementById('2410.08490v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08490v3-abstract-full" style="display: none;"> Iodinated contrast agents are widely utilized in numerous interventional procedures, yet posing substantial health risks to patients. This paper presents CAS-GAN, a novel GAN framework that serves as a "virtual contrast agent" to synthesize X-ray angiographies via disentanglement representation learning and vessel semantic guidance, thereby reducing the reliance on iodinated contrast agents during interventional procedures. Specifically, our approach disentangles X-ray angiographies into background and vessel components, leveraging medical prior knowledge. A specialized predictor then learns to map the interrelationships between these components. Additionally, a vessel semantic-guided generator and a corresponding loss function are introduced to enhance the visual fidelity of generated images. Experimental results on the XCAD dataset demonstrate the state-of-the-art performance of our CAS-GAN, achieving a FID of 5.87 and a MMD of 0.016. 
These promising results highlight CAS-GAN's potential for clinical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08490v3-abstract-full').style.display = 'none'; document.getElementById('2410.08490v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE Symposium Series on Computational Intelligence (SSCI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07930">arXiv:2410.07930</a> <span> [<a href="https://arxiv.org/pdf/2410.07930">pdf</a>, <a href="https://arxiv.org/format/2410.07930">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> </div> </div> <p class="title is-5 mathjax"> Cost-aware Simulation-based Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bharti%2C+A">Ayush Bharti</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Daolang Huang</a>, <a href="/search/cs?searchtype=author&query=Kaski%2C+S">Samuel Kaski</a>, <a href="/search/cs?searchtype=author&query=Briol%2C+F">François-Xavier Briol</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07930v1-abstract-short" style="display: inline;"> Simulation-based inference (SBI) is the preferred framework for estimating parameters of intractable models in science and engineering. A significant challenge in this context is the large computational cost of simulating data from complex models, and the fact that this cost often depends on parameter values. We therefore propose \textit{cost-aware SBI methods} which can significantly reduce the c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07930v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07930v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07930v1-abstract-full" style="display: none;"> Simulation-based inference (SBI) is the preferred framework for estimating parameters of intractable models in science and engineering. A significant challenge in this context is the large computational cost of simulating data from complex models, and the fact that this cost often depends on parameter values. We therefore propose \textit{cost-aware SBI methods} which can significantly reduce the cost of existing sampling-based SBI methods, such as neural SBI and approximate Bayesian computation.
This is achieved through a combination of rejection and self-normalised importance sampling, which significantly reduces the number of expensive simulations needed. Our approach is studied extensively on models from epidemiology to telecommunications engineering, where we obtain significant reductions in the overall cost of inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07930v1-abstract-full').style.display = 'none'; document.getElementById('2410.07930v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Huang%2C+D&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Huang%2C+D&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+D&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+D&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+D&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Huang%2C+D&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 
23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>