Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,519 results for author: <span class="mathjax">Gao, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Gao%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Gao, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gao%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gao, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17605">arXiv:2411.17605</a> <span> [<a href="https://arxiv.org/pdf/2411.17605">pdf</a>, <a href="https://arxiv.org/format/2411.17605">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Distractor-free Generalizable 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+Y">Yanqi Bao</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jing Liao</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17605v1-abstract-short" style="display: inline;"> We present DGGS, a novel framework addressing the previously unexplored challenge of Distractor-free Generalizable 3D Gaussian Splatting (3DGS). It accomplishes two key objectives: fortifying generalizable 3DGS against distractor-laden data during both training and inference phases, while successfully extending cross-scene adaptation capabilities to conventional distractor-free approaches. To achi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17605v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17605v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17605v1-abstract-full" style="display: none;"> We present DGGS, a novel framework addressing the previously unexplored challenge of Distractor-free Generalizable 3D Gaussian Splatting (3DGS). It accomplishes two key objectives: fortifying generalizable 3DGS against distractor-laden data during both training and inference phases, while successfully extending cross-scene adaptation capabilities to conventional distractor-free approaches. To achieve these objectives, DGGS introduces a scene-agnostic reference-based mask prediction and refinement methodology during training phase, coupled with a training view selection strategy, effectively improving distractor prediction accuracy and training stability. Moreover, to address distractor-induced voids and artifacts during inference stage, we propose a two-stage inference framework for better reference selection based on the predicted distractor masks, complemented by a distractor pruning module to eliminate residual distractor effects. Extensive generalization experiments demonstrate DGGS's advantages under distractor-laden conditions. Additionally, experimental results show that our scene-agnostic mask inference achieves accuracy comparable to scene-specific trained methods. Homepage is \url{https://github.com/bbbbby-99/DGGS}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17605v1-abstract-full').style.display = 'none'; document.getElementById('2411.17605v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17296">arXiv:2411.17296</a> <span> [<a href="https://arxiv.org/pdf/2411.17296">pdf</a>, <a href="https://arxiv.org/format/2411.17296">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GrokFormer: Graph Fourier Kolmogorov-Arnold Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ai%2C+G">Guoguo Ai</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+G">Guansong Pang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+H">Hezhe Qiao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Hui Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17296v1-abstract-short" style="display: inline;"> Graph Transformers (GTs) have demonstrated remarkable performance in incorporating various graph structure information, e.g., long-range structural dependency, into graph representation learning. However, self-attention -- the core module of GTs -- preserves only low-frequency signals on graph features, retaining only homophilic patterns that capture similar features among the connected nodes. Con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17296v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17296v1-abstract-full" style="display: none;"> Graph Transformers (GTs) have demonstrated remarkable performance in incorporating various graph structure information, e.g., long-range structural dependency, into graph representation learning. However, self-attention -- the core module of GTs -- preserves only low-frequency signals on graph features, retaining only homophilic patterns that capture similar features among the connected nodes. Consequently, it has insufficient capacity in modeling complex node label patterns, such as the opposite of homophilic patterns -- heterophilic patterns. Some improved GTs deal with the problem by learning polynomial filters or performing self-attention over the first-order graph spectrum. However, these GTs either ignore rich information contained in the whole spectrum or neglect higher-order spectrum information, resulting in limited flexibility and frequency response in their spectral filters. To tackle these challenges, we propose a novel GT network, namely Graph Fourier Kolmogorov-Arnold Transformers (GrokFormer), to go beyond the self-attention in GTs. GrokFormer leverages learnable activation functions in order-$K$ graph spectrum through Fourier series modeling to i) learn eigenvalue-targeted filter functions producing learnable base that can capture a broad range of frequency signals flexibly, and ii) extract first- and higher-order graph spectral information adaptively. In doing so, GrokFormer can effectively capture intricate patterns hidden across different orders and levels of frequency signals, learning expressive, order-and-frequency-adaptive graph representations. Comprehensive experiments conducted on 10 node classification datasets across various domains, scales, and levels of graph heterophily, as well as 5 graph classification datasets, demonstrate that GrokFormer outperforms state-of-the-art GTs and other advanced graph neural networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17296v1-abstract-full').style.display = 'none'; document.getElementById('2411.17296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures, 7tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16779">arXiv:2411.16779</a> <span> [<a href="https://arxiv.org/pdf/2411.16779">pdf</a>, <a href="https://arxiv.org/format/2411.16779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NovelGS: Consistent Novel-view Denoising via Large Gaussian Reconstruction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinpeng Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiale Xu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Weihao Cheng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yiming Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintao Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Ying Shan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yansong Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16779v1-abstract-short" style="display: inline;"> We introduce NovelGS, a diffusion model for Gaussian Splatting (GS) given sparse-view images. Recent works leverage feed-forward networks to generate pixel-aligned Gaussians, which could be fast rendered. Unfortunately, the method was unable to produce satisfactory results for areas not covered by the input images due to the formulation of these methods. In contrast, we leverage the novel view den… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16779v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16779v1-abstract-full" style="display: none;"> We introduce NovelGS, a diffusion model for Gaussian Splatting (GS) given sparse-view images. Recent works leverage feed-forward networks to generate pixel-aligned Gaussians, which could be fast rendered. Unfortunately, the method was unable to produce satisfactory results for areas not covered by the input images due to the formulation of these methods. In contrast, we leverage the novel view denoising through a transformer-based network to generate 3D Gaussians. Specifically, by incorporating both conditional views and noisy target views, the network predicts pixel-aligned Gaussians for each view. During training, the rendered target and some additional views of the Gaussians are supervised. During inference, the target views are iteratively rendered and denoised from pure noise. Our approach demonstrates state-of-the-art performance in addressing the multi-view image reconstruction challenge. Due to generative modeling of unseen regions, NovelGS effectively reconstructs 3D objects with consistent and sharp textures. Experimental results on publicly available datasets indicate that NovelGS substantially surpasses existing image-to-3D frameworks, both qualitatively and quantitatively. We also demonstrate the potential of NovelGS in generative tasks, such as text-to-3D and image-to-3D, by integrating it with existing multiview diffusion models. We will make the code publicly accessible. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16779v1-abstract-full').style.display = 'none'; document.getElementById('2411.16779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15839">arXiv:2411.15839</a> <span> [<a href="https://arxiv.org/pdf/2411.15839">pdf</a>, <a href="https://arxiv.org/format/2411.15839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VaLiD: Mitigating the Hallucination of Large Vision Language Models by Visual Layer Fusion Contrastive Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifei Gao</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15839v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have demonstrated outstanding performance in multimodal task reasoning. However, they often generate responses that appear plausible yet do not accurately reflect the visual content, a phenomenon known as hallucination. Recent approaches have introduced training-free methods that mitigate hallucinations by adjusting the decoding strategy during inference stage,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15839v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15839v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have demonstrated outstanding performance in multimodal task reasoning. However, they often generate responses that appear plausible yet do not accurately reflect the visual content, a phenomenon known as hallucination. Recent approaches have introduced training-free methods that mitigate hallucinations by adjusting the decoding strategy during inference stage, typically attributing hallucination to the language model itself. Our analysis, however, reveals that distortions in the visual encoding process significantly affect the model's reasoning accuracy. Specifically, earlier visual layers may retain key features but gradually distort as the information propagates toward the output layer. Building on these findings, we propose a novel hallucination-mitigation method from the visual encoding perspective: \textbf{V}isu\textbf{a}l \textbf{L}ayer Fus\textbf{i}on Contrastive \textbf{D}ecoding (VaLiD). This method utilizes uncertainty to guide the selection of visual hidden layers, correcting distortions in the visual encoding process and thereby improving the reliability of generated text. Experimental results show that VaLiD effectively reduces hallucinations across various benchmarks, achieving state-of-the-art performance compared to multiple baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15839v1-abstract-full').style.display = 'none'; document.getElementById('2411.15839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15766">arXiv:2411.15766</a> <span> [<a href="https://arxiv.org/pdf/2411.15766">pdf</a>, <a href="https://arxiv.org/format/2411.15766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> ScalingNote: Scaling up Retrievers with Large Language Models for Real-World Dense Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Suyuan Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuanyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoxin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Maolin Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+S">Shaosheng Cao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiangyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zengchang Qin</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yan Gao</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunhan Bai</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jun Fan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yao Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+E">Enhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15766v1-abstract-short" style="display: inline;"> Dense retrieval in most industries employs dual-tower architectures to retrieve query-relevant documents. Due to online deployment requirements, existing real-world dense retrieval systems mainly enhance performance by designing negative sampling strategies, overlooking the advantages of scaling up. Recently, Large Language Models (LLMs) have exhibited superior performance that can be leveraged fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15766v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15766v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15766v1-abstract-full" style="display: none;"> Dense retrieval in most industries employs dual-tower architectures to retrieve query-relevant documents. Due to online deployment requirements, existing real-world dense retrieval systems mainly enhance performance by designing negative sampling strategies, overlooking the advantages of scaling up. Recently, Large Language Models (LLMs) have exhibited superior performance that can be leveraged for scaling up dense retrieval. However, scaling up retrieval models significantly increases online query latency. To address this challenge, we propose ScalingNote, a two-stage method to exploit the scaling potential of LLMs for retrieval while maintaining online query latency. The first stage is training dual towers, both initialized from the same LLM, to unlock the potential of LLMs for dense retrieval. Then, we distill only the query tower using mean squared error loss and cosine similarity to reduce online costs. Through theoretical analysis and comprehensive offline and online experiments, we show the effectiveness and efficiency of ScalingNote. Our two-stage scaling method outperforms end-to-end models and verifies the scaling law of dense retrieval with LLMs in industrial scenarios, enabling cost-effective scaling of dense retrieval systems. Our online method incorporating ScalingNote significantly enhances the relevance between retrieved documents and queries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15766v1-abstract-full').style.display = 'none'; document.getElementById('2411.15766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15758">arXiv:2411.15758</a> <span> [<a href="https://arxiv.org/pdf/2411.15758">pdf</a>, <a href="https://arxiv.org/format/2411.15758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681705">10.1145/3664647.3681705 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Decoding Urban Industrial Complexity: Enhancing Knowledge-Driven Insights via IndustryScopeGPT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siqi Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chao Liang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yunfan Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15758v1-abstract-short" style="display: inline;"> Industrial parks are critical to urban economic growth. Yet, their development often encounters challenges stemming from imbalances between industrial requirements and urban services, underscoring the need for strategic planning and operations. This paper introduces IndustryScopeKG, a pioneering large-scale multi-modal, multi-level industrial park knowledge graph, which integrates diverse urban da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15758v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15758v1-abstract-full" style="display: none;"> Industrial parks are critical to urban economic growth. Yet, their development often encounters challenges stemming from imbalances between industrial requirements and urban services, underscoring the need for strategic planning and operations. This paper introduces IndustryScopeKG, a pioneering large-scale multi-modal, multi-level industrial park knowledge graph, which integrates diverse urban data including street views, corporate, socio-economic, and geospatial information, capturing the complex relationships and semantics within industrial parks. Alongside this, we present the IndustryScopeGPT framework, which leverages Large Language Models (LLMs) with Monte Carlo Tree Search to enhance tool-augmented reasoning and decision-making in Industrial Park Planning and Operation (IPPO). Our work significantly improves site recommendation and functional planning, demonstrating the potential of combining LLMs with structured datasets to advance industrial park management. This approach sets a new benchmark for intelligent IPPO research and lays a robust foundation for advancing urban industrial development. The dataset and related code are available at https://github.com/Tongji-KGLLM/IndustryScope. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15758v1-abstract-full').style.display = 'none'; document.getElementById('2411.15758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures, the 32nd ACM International Conference on Multimedia</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0; I.2.7; H.3.3; H.4.0 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In Proceedings of the 32nd ACM International Conference on Multimedia, pp. 4757-4765 (2024, October) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14967">arXiv:2411.14967</a> <span> [<a href="https://arxiv.org/pdf/2411.14967">pdf</a>, <a href="https://arxiv.org/format/2411.14967">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> SwissADT: An Audio Description Translation System for Swiss Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fischer%2C+L">Lukas Fischer</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yingqiang Gao</a>, <a href="/search/cs?searchtype=author&query=Lintner%2C+A">Alexa Lintner</a>, <a href="/search/cs?searchtype=author&query=Ebling%2C+S">Sarah Ebling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14967v1-abstract-short" style="display: inline;"> Audio description (AD) is a crucial accessibility service provided to blind persons and persons with visual impairment, designed to convey visual information in acoustic form. Despite recent advancements in multilingual machine translation research, the lack of well-crafted and time-synchronized AD data impedes the development of audio description translation (ADT) systems that address the needs o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14967v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14967v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14967v1-abstract-full" style="display: none;"> Audio description (AD) is a crucial accessibility service provided to blind persons and persons with visual impairment, designed to convey visual information in acoustic form. Despite recent advancements in multilingual machine translation research, the lack of well-crafted and time-synchronized AD data impedes the development of audio description translation (ADT) systems that address the needs of multilingual countries such as Switzerland. Furthermore, since the majority of ADT systems rely solely on text, uncertainty exists as to whether incorporating visual information from the corresponding video clips can enhance the quality of ADT outputs. In this work, we present SwissADT, the first ADT system implemented for three main Swiss languages and English. By collecting well-crafted AD data augmented with video clips in German, French, Italian, and English, and leveraging the power of Large Language Models (LLMs), we aim to enhance information accessibility for diverse language populations in Switzerland by automatically translating AD scripts to the desired Swiss language. Our extensive experimental ADT results, composed of both automatic and human evaluations of ADT quality, demonstrate the promising capability of SwissADT for the ADT task. We believe that combining human expertise with the generation power of LLMs can further enhance the performance of ADT systems, ultimately benefiting a larger multilingual target population. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14967v1-abstract-full').style.display = 'none'; document.getElementById('2411.14967v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14865">arXiv:2411.14865</a> <span> [<a href="https://arxiv.org/pdf/2411.14865">pdf</a>, <a href="https://arxiv.org/format/2411.14865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking the Robustness of Optical Flow Estimation to Corruptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+Z">Zhonghua Yi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Hao Shi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qi Jiang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yao Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ze Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yufan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaiwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14865v1-abstract-short" style="display: inline;"> Optical flow estimation is extensively used in autonomous driving and video editing. While existing models demonstrate state-of-the-art performance across various benchmarks, the robustness of these methods has been infrequently investigated. Despite some research focusing on the robustness of optical flow models against adversarial attacks, there has been a lack of studies investigating their rob… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14865v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14865v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14865v1-abstract-full" style="display: none;"> Optical flow estimation is extensively used in autonomous driving and video editing. While existing models demonstrate state-of-the-art performance across various benchmarks, the robustness of these methods has been infrequently investigated. Despite some research focusing on the robustness of optical flow models against adversarial attacks, there has been a lack of studies investigating their robustness to common corruptions. Taking into account the unique temporal characteristics of optical flow, we introduce 7 temporal corruptions specifically designed for benchmarking the robustness of optical flow models, in addition to 17 classical single-image corruptions, in which advanced PSF Blur simulation method is performed. Two robustness benchmarks, KITTI-FC and GoPro-FC, are subsequently established as the first corruption robustness benchmark for optical flow estimation, with Out-Of-Domain (OOD) and In-Domain (ID) settings to facilitate comprehensive studies. Robustness metrics, Corruption Robustness Error (CRE), Corruption Robustness Error ratio (CREr), and Relative Corruption Robustness Error (RCRE) are further introduced to quantify the optical flow estimation robustness. 29 model variants from 15 optical flow methods are evaluated, yielding 10 intriguing observations, such as 1) the absolute robustness of the model is heavily dependent on the estimation performance; 2) the corruptions that diminish local information are more serious than that reduce visual effects. We also give suggestions for the design and application of optical flow models. We anticipate that our benchmark will serve as a foundational resource for advancing research in robust optical flow estimation. The benchmarks and source code will be released at https://github.com/ZhonghuaYi/optical_flow_robustness_benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14865v1-abstract-full').style.display = 'none'; document.getElementById('2411.14865v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The benchmarks and source code will be released at https://github.com/ZhonghuaYi/optical_flow_robustness_benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14347">arXiv:2411.14347</a> <span> [<a href="https://arxiv.org/pdf/2411.14347">pdf</a>, <a href="https://arxiv.org/format/2411.14347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tianhe Ren</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yihao Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qing Jiang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhaoyang Zeng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuda Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenlong Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Junyi Shen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoke Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhuheng Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hongjie Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Han Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Feng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kent Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14347v1-abstract-short" style="display: inline;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extend… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14347v1-abstract-full" style="display: none;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extends its input options to support text prompt, visual prompt, and customized prompt. With such flexible prompt options, we develop a universal object prompt to support prompt-free open-world detection, making it possible to detect anything in an image without requiring users to provide any prompt. To enhance the model's core grounding capability, we have constructed a large-scale dataset with over 100 million high-quality grounding samples, referred to as Grounding-100M, for advancing the model's open-vocabulary detection performance. Pre-training on such a large-scale grounding dataset leads to a foundational object-level representation, which enables DINO-X to integrate multiple perception heads to simultaneously support multiple object perception and understanding tasks, including detection, segmentation, pose estimation, object captioning, object-based QA, etc. Experimental results demonstrate the superior performance of DINO-X. Specifically, the DINO-X Pro model achieves 56.0 AP, 59.8 AP, and 52.4 AP on the COCO, LVIS-minival, and LVIS-val zero-shot object detection benchmarks, respectively. Notably, it scores 63.3 AP and 56.5 AP on the rare classes of LVIS-minival and LVIS-val benchmarks, both improving the previous SOTA performance by 5.8 AP. Such a result underscores its significantly improved capacity for recognizing long-tailed objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'none'; document.getElementById('2411.14347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13909">arXiv:2411.13909</a> <span> [<a href="https://arxiv.org/pdf/2411.13909">pdf</a>, <a href="https://arxiv.org/format/2411.13909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Panther: Illuminate the Sight of Multimodal LLMs with Instruction-Guided Visual Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Honglin Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuting Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenglu Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingdong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13909v2-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) are closing the gap to human visual perception capability rapidly, while, still lag behind on attending to subtle images details or locating small objects precisely, etc. Common schemes to tackle these issues include deploying multiple vision encoders or operating on original high-resolution images. Few studies have concentrated on taking the textual instru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13909v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13909v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13909v2-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) are closing the gap to human visual perception capability rapidly, while, still lag behind on attending to subtle images details or locating small objects precisely, etc. Common schemes to tackle these issues include deploying multiple vision encoders or operating on original high-resolution images. Few studies have concentrated on taking the textual instruction into improving visual representation, resulting in losing focus in some vision-centric tasks, a phenomenon we herein termed as Amblyopia. In this work, we introduce Panther, a MLLM that closely adheres to user instruction and locates targets of interests precisely, with the finesse of a black panther. Specifically, Panther comprises three integral components: Panther-VE, Panther-Bridge, and Panther-Decoder. Panther-VE integrates user instruction information at the early stages of the vision encoder, thereby extracting the most relevant and useful visual representations. The Panther-Bridge module, equipped with powerful filtering capabilities, significantly reduces redundant visual information, leading to a substantial savings in training costs. The Panther-Decoder is versatile and can be employed with any decoder-only architecture of LLMs without discrimination. Experimental results, particularly on vision-centric benchmarks, have demonstrated the effectiveness of Panther. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13909v2-abstract-full').style.display = 'none'; document.getElementById('2411.13909v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13785">arXiv:2411.13785</a> <span> [<a href="https://arxiv.org/pdf/2411.13785">pdf</a>, <a href="https://arxiv.org/ps/2411.13785">ps</a>, <a href="https://arxiv.org/format/2411.13785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Throughput Maximization for Movable Antenna Systems with Movement Delay Consideration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Honghao Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Ying Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen Chen</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+W">Weidong Mei</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+G">Guojie Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lexi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13785v1-abstract-short" style="display: inline;"> In this paper, we model the minimum achievable throughput within a transmission block of restricted duration and aim to maximize it in movable antenna (MA)-enabled multiuser downlink communications. Particularly, we account for the antenna moving delay caused by mechanical movement, which has not been fully considered in previous studies, and reveal the trade-off between the delay and signal-to-in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13785v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13785v1-abstract-full" style="display: none;"> In this paper, we model the minimum achievable throughput within a transmission block of restricted duration and aim to maximize it in movable antenna (MA)-enabled multiuser downlink communications. Particularly, we account for the antenna moving delay caused by mechanical movement, which has not been fully considered in previous studies, and reveal the trade-off between the delay and signal-to-interference-plus-noise ratio at users. To this end, we first consider a single-user setup to analyze the necessity of antenna movement. By quantizing the virtual angles of arrival, we derive the requisite region size for antenna moving, design the initial MA position, and elucidate the relationship between quantization resolution and moving region size. Furthermore, an efficient algorithm is developed to optimize MA position via successive convex approximation, which is subsequently extended to the general multiuser setup. Numerical results demonstrate that the proposed algorithms outperform fixed-position antenna schemes and existing ones without consideration of movement delay. Additionally, our algorithms exhibit excellent adaptability and stability across various transmission block durations and moving region sizes, and are robust to different antenna moving speeds. This allows the hardware cost of MA-aided systems to be reduced by employing low rotational speed motors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13785v1-abstract-full').style.display = 'none'; document.getElementById('2411.13785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13560">arXiv:2411.13560</a> <span> [<a href="https://arxiv.org/pdf/2411.13560">pdf</a>, <a href="https://arxiv.org/format/2411.13560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> AMSnet-KG: A Netlist Dataset for LLM-based AMS Circuit Auto-Design Using Knowledge Graph RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichen Shi</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Z">Zhuofu Tao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhao Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianjia Zhou</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaxing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Genhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Alvin Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiping Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Ting-Jung Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13560v1-abstract-short" style="display: inline;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13560v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13560v1-abstract-full" style="display: none;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements in the automatic design process for large-scale AMS circuits. However, the absence of high-quality datasets has led to issues such as model hallucination, which undermines the robustness of automatically generated circuit designs. To address this issue, this paper introduces AMSnet-KG, a dataset encompassing various AMS circuit schematics and netlists. We construct a knowledge graph with annotations on detailed functional and performance characteristics. Facilitated by AMSnet-KG, we propose an automated AMS circuit generation framework that utilizes the comprehensive knowledge embedded in LLMs. We first formulate a design strategy (e.g., circuit architecture using a number of circuit components) based on required specifications. Next, matched circuit components are retrieved and assembled into a complete topology, and transistor sizing is obtained through Bayesian optimization. Simulation results of the netlist are fed back to the LLM for further topology refinement, ensuring the circuit design specifications are met. We perform case studies of operational amplifier and comparator design to verify the automatic design flow from specifications to netlists with minimal human effort. The dataset used in this paper will be open-sourced upon publishing of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'none'; document.getElementById('2411.13560v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13069">arXiv:2411.13069</a> <span> [<a href="https://arxiv.org/pdf/2411.13069">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automatic marker-free registration based on similar tetrahedras for single-tree point clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jing Ren</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanlong Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuhan Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhang Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenxin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingtai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingyun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13069v1-abstract-short" style="display: inline;"> In recent years, terrestrial laser scanning technology has been widely used to collect tree point cloud data, aiding in measurements of diameter at breast height, biomass, and other forestry survey data. Since a single scan from terrestrial laser systems captures data from only one angle, multiple scans must be registered and fused to obtain complete tree point cloud data. This paper proposes a ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13069v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13069v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13069v1-abstract-full" style="display: none;"> In recent years, terrestrial laser scanning technology has been widely used to collect tree point cloud data, aiding in measurements of diameter at breast height, biomass, and other forestry survey data. Since a single scan from terrestrial laser systems captures data from only one angle, multiple scans must be registered and fused to obtain complete tree point cloud data. This paper proposes a marker-free automatic registration method for single-tree point clouds based on similar tetrahedras. First, two point clouds from two scans of the same tree are used to generate tree skeletons, and key point sets are constructed from these skeletons. Tetrahedra are then filtered and matched according to similarity principles, with the vertices of these two matched tetrahedras selected as matching point pairs, thus completing the coarse registration of the point clouds from the two scans. Subsequently, the ICP method is applied to the coarse-registered leaf point clouds to obtain fine registration parameters, completing the precise registration of the two tree point clouds. Experiments were conducted using terrestrial laser scanning data from eight trees, each from different species and with varying shapes. The proposed method was evaluated using RMSE and Hausdorff distance, compared against the traditional ICP and NDT methods. The experimental results demonstrate that the proposed method significantly outperforms both ICP and NDT in registration accuracy, achieving speeds up to 593 times and 113 times faster than ICP and NDT, respectively. In summary, the proposed method shows good robustness in single-tree point cloud registration, with significant advantages in accuracy and speed compared to traditional ICP and NDT methods, indicating excellent application prospects in practical registration scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13069v1-abstract-full').style.display = 'none'; document.getElementById('2411.13069v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">remote sensing; terrestrial lidar; multi-scan cloud registration</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12781">arXiv:2411.12781</a> <span> [<a href="https://arxiv.org/pdf/2411.12781">pdf</a>, <a href="https://arxiv.org/format/2411.12781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FGP: Feature-Gradient-Prune for Efficient Convolutional Layer Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+Q">Qingsong Lv</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiasheng Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Sheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Liangcheng Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yun Gao</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+S">Sun Qiao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jie Song</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiajun Bu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12781v1-abstract-short" style="display: inline;"> To reduce computational overhead while maintaining model performance, model pruning techniques have been proposed. Among these, structured pruning, which removes entire convolutional channels or layers, significantly enhances computational efficiency and is compatible with hardware acceleration. However, existing pruning methods that rely solely on image features or gradients often result in the r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12781v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12781v1-abstract-full" style="display: none;"> To reduce computational overhead while maintaining model performance, model pruning techniques have been proposed. Among these, structured pruning, which removes entire convolutional channels or layers, significantly enhances computational efficiency and is compatible with hardware acceleration. However, existing pruning methods that rely solely on image features or gradients often result in the retention of redundant channels, negatively impacting inference efficiency. To address this issue, this paper introduces a novel pruning method called Feature-Gradient Pruning (FGP). This approach integrates both feature-based and gradient-based information to more effectively evaluate the importance of channels across various target classes, enabling a more accurate identification of channels that are critical to model performance. Experimental results demonstrate that the proposed method improves both model compactness and practicality while maintaining stable performance. Experiments conducted across multiple tasks and datasets show that FGP significantly reduces computational costs and minimizes accuracy loss compared to existing methods, highlighting its effectiveness in optimizing pruning outcomes. The source code is available at: https://github.com/FGP-code/FGP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12781v1-abstract-full').style.display = 'none'; document.getElementById('2411.12781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12746">arXiv:2411.12746</a> <span> [<a href="https://arxiv.org/pdf/2411.12746">pdf</a>, <a href="https://arxiv.org/format/2411.12746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Review of Reinforcement Learning in Financial Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yahui Bai</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhe Gao</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Runzhe Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+R">Rui Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12746v1-abstract-short" style="display: inline;"> In recent years, there has been a growing trend of applying Reinforcement Learning (RL) in financial applications. This approach has shown great potential to solve decision-making tasks in finance. In this survey, we present a comprehensive study of the applications of RL in finance and conduct a series of meta-analyses to investigate the common themes in the literature, such as the factors th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12746v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12746v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12746v1-abstract-full" style="display: none;"> In recent years, there has been a growing trend of applying Reinforcement Learning (RL) in financial applications. This approach has shown great potential to solve decision-making tasks in finance. In this survey, we present a comprehensive study of the applications of RL in finance and conduct a series of meta-analyses to investigate the common themes in the literature, such as the factors that most significantly affect RL's performance compared to traditional methods. Moreover, we identify challenges including explainability, Markov Decision Process (MDP) modeling, and robustness that hinder the broader utilization of RL in the financial industry and discuss recent advancements in overcoming these challenges. Finally, we propose future research directions, such as benchmarking, contextual RL, multi-agent RL, and model-based RL to address these challenges and to further enhance the implementation of RL in finance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12746v1-abstract-full').style.display = 'none'; document.getElementById('2411.12746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12309">arXiv:2411.12309</a> <span> [<a href="https://arxiv.org/pdf/2411.12309">pdf</a>, <a href="https://arxiv.org/format/2411.12309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DGTR: Distributed Gaussian Turbo-Reconstruction for Sparse-View Vast Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuanyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Haosong Peng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chenming Wu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Weicai Ye</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yufeng Zhan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dingwen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Junwei Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12309v2-abstract-short" style="display: inline;"> Novel-view synthesis (NVS) approaches play a critical role in vast scene reconstruction. However, these methods rely heavily on dense image inputs and prolonged training times, making them unsuitable where computational resources are limited. Additionally, few-shot methods often struggle with poor reconstruction quality in vast environments. This paper presents DGTR, a novel distributed framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12309v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12309v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12309v2-abstract-full" style="display: none;"> Novel-view synthesis (NVS) approaches play a critical role in vast scene reconstruction. However, these methods rely heavily on dense image inputs and prolonged training times, making them unsuitable where computational resources are limited. Additionally, few-shot methods often struggle with poor reconstruction quality in vast environments. This paper presents DGTR, a novel distributed framework for efficient Gaussian reconstruction for sparse-view vast scenes. Our approach divides the scene into regions, processed independently by drones with sparse image inputs. Using a feed-forward Gaussian model, we predict high-quality Gaussian primitives, followed by a global alignment algorithm to ensure geometric consistency. Synthetic views and depth priors are incorporated to further enhance training, while a distillation-based model aggregation mechanism enables efficient reconstruction. Our method achieves high-quality large-scale scene reconstruction and novel-view synthesis in significantly reduced training times, outperforming existing approaches in both speed and scalability. We demonstrate the effectiveness of our framework on vast aerial scenes, achieving high-quality results within minutes. Code will released on our [https://3d-aigc.github.io/DGTR]. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12309v2-abstract-full').style.display = 'none'; document.getElementById('2411.12309v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will released on our [https://3d-aigc.github.io/DGTR]</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11933">arXiv:2411.11933</a> <span> [<a href="https://arxiv.org/pdf/2411.11933">pdf</a>, <a href="https://arxiv.org/format/2411.11933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> METEOR: Evolutionary Journey of Large Language Models from Guidance to Self-Growth </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiawei Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chong Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11933v1-abstract-short" style="display: inline;"> Model evolution enables learning from feedback to refine experiences and update skills, transforming models from having no domain knowledge to becoming domain experts. However, there is currently no unified and effective method for guiding this evolutionary process. To address this gap, we propose the Meteor method, which includes three training phases: weak-to-strong data distillation, iterative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11933v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11933v1-abstract-full" style="display: none;"> Model evolution enables learning from feedback to refine experiences and update skills, transforming models from having no domain knowledge to becoming domain experts. However, there is currently no unified and effective method for guiding this evolutionary process. To address this gap, we propose the Meteor method, which includes three training phases: weak-to-strong data distillation, iterative training, and self-evolution strategies. Each phase maximizes the model's inherent domain capabilities, allowing it to autonomously refine its domain knowledge and enhance performance. Experiments demonstrate that our approach significantly improves accuracy, completeness, relevance, coherence, and reliability across domain-specific tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11933v1-abstract-full').style.display = 'none'; document.getElementById('2411.11933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11932">arXiv:2411.11932</a> <span> [<a href="https://arxiv.org/pdf/2411.11932">pdf</a>, <a href="https://arxiv.org/format/2411.11932">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reviving Dormant Memories: Investigating Catastrophic Forgetting in Language Models through Rationale-Guidance Difficulty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+H">Huashan Sun</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11932v1-abstract-short" style="display: inline;"> Although substantial efforts have been made to mitigate catastrophic forgetting in continual learning, the intrinsic mechanisms are not well understood. In this paper, we discover that when a forgetting model passively receives an externally provided partial appropriate rationale, its performance on the forgotten task can be restored. Furthermore, by simply adding a task-agnostic prefix to the ori… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11932v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11932v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11932v1-abstract-full" style="display: none;"> Although substantial efforts have been made to mitigate catastrophic forgetting in continual learning, the intrinsic mechanisms are not well understood. In this paper, we discover that when a forgetting model passively receives an externally provided partial appropriate rationale, its performance on the forgotten task can be restored. Furthermore, by simply adding a task-agnostic prefix to the original instruction, the forgetting model can actively generate an appropriate rationale to reach the correct answer. These findings suggest that the model does not actually ``forget'' the task knowledge; instead, the degraded performance can be attributed to the failure of the original instructions in guiding the model to generate the appropriate rationales. Based on this insight, we propose the Rationale-Guidance Difficulty metric to evaluate how effectively a given instruction guides the model in generating appropriate rationales. We apply this metric to optimize the allocation of replay data in replay-based continual learning algorithm. Experimental results demonstrate that our data allocation method effectively mitigates catastrophic forgetting and maintains better model plasticity simultaneously across models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11932v1-abstract-full').style.display = 'none'; document.getElementById('2411.11932v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11681">arXiv:2411.11681</a> <span> [<a href="https://arxiv.org/pdf/2411.11681">pdf</a>, <a href="https://arxiv.org/format/2411.11681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PSPO*: An Effective Process-supervised Policy Optimization for Reasoning Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiawei Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xinyue Liang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yizhe Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+C">Chong Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11681v2-abstract-short" style="display: inline;"> Process supervision enhances the performance of large language models in reasoning tasks by providing feedback at each step of chain-of-thought reasoning. However, due to the lack of effective process supervision methods, even advanced large language models are prone to logical errors and redundant reasoning. We claim that the effectiveness of process supervision significantly depends on both the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11681v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11681v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11681v2-abstract-full" style="display: none;"> Process supervision enhances the performance of large language models in reasoning tasks by providing feedback at each step of chain-of-thought reasoning. However, due to the lack of effective process supervision methods, even advanced large language models are prone to logical errors and redundant reasoning. We claim that the effectiveness of process supervision significantly depends on both the accuracy and the length of reasoning chains. Moreover, we identify that these factors exhibit a nonlinear relationship with the overall reward score of the reasoning process. Inspired by these insights, we propose a novel process supervision paradigm, PSPO*, which systematically outlines the workflow from reward model training to policy optimization, and highlights the importance of nonlinear rewards in process supervision. Based on PSPO*, we develop the PSPO-WRS, which considers the number of reasoning steps in determining reward scores and utilizes an adjusted Weibull distribution for nonlinear reward shaping. Experimental results on six mathematical reasoning datasets demonstrate that PSPO-WRS consistently outperforms current mainstream models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11681v2-abstract-full').style.display = 'none'; document.getElementById('2411.11681v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our code can be found at https://github.com/DIRECT-BIT/PSPO</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11053">arXiv:2411.11053</a> <span> [<a href="https://arxiv.org/pdf/2411.11053">pdf</a>, <a href="https://arxiv.org/format/2411.11053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SRA-MCTS: Self-driven Reasoning Augmentation with Monte Carlo Tree Search for Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bin Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiguan Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinghao Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11053v4-abstract-short" style="display: inline;"> Large language models demonstrate exceptional performance in simple code generation tasks but still face challenges in tackling complex problems. These challenges may stem from insufficient reasoning and problem decomposition capabilities. To address this issue, we propose a reasoning-augmented data generation process, SRA-MCTS, which guides the model to autonomously generate high-quality intermed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11053v4-abstract-full').style.display = 'inline'; document.getElementById('2411.11053v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11053v4-abstract-full" style="display: none;"> Large language models demonstrate exceptional performance in simple code generation tasks but still face challenges in tackling complex problems. These challenges may stem from insufficient reasoning and problem decomposition capabilities. To address this issue, we propose a reasoning-augmented data generation process, SRA-MCTS, which guides the model to autonomously generate high-quality intermediate reasoning paths. This creates a positive feedback loop, enabling continuous improvement. Our method operates entirely through the model itself without requiring additional supervision. By synthesizing natural language reasoning paths and translating them into executable code, the approach ensures analytical accuracy and enhances the success rate in solving complex tasks. Experimental results show that, even without additional supervisory signals, our method achieves performance improvements across different model scales, demonstrating the significant potential of self-improvement in small models. Furthermore, the method remains robust when traditional Chain-of-Thought (CoT) approaches exhibit performance degradation, with notable improvements observed in diversity metrics such as pass@10. We encourage further exploration of reasoning processes within training data to enhance the ability of language models to address complex problems. Our code and data are public at https://github.com/DIRECT-BIT/SRA-MCTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11053v4-abstract-full').style.display = 'none'; document.getElementById('2411.11053v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10789">arXiv:2411.10789</a> <span> [<a href="https://arxiv.org/pdf/2411.10789">pdf</a>, <a href="https://arxiv.org/format/2411.10789">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Anatomy-Guided Radiology Report Generation with Pathology-Aware Regional Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yijian Gao</a>, <a href="/search/cs?searchtype=author&query=Marshall%2C+D">Dominic Marshall</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+X">Xiaodan Xing</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+J">Junzhi Ning</a>, <a href="/search/cs?searchtype=author&query=Papanastasiou%2C+G">Giorgos Papanastasiou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a>, <a href="/search/cs?searchtype=author&query=Komorowski%2C+M">Matthieu Komorowski</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10789v1-abstract-short" style="display: inline;"> Radiology reporting generative AI holds significant potential to alleviate clinical workloads and streamline medical care. However, achieving high clinical accuracy is challenging, as radiological images often feature subtle lesions and intricate structures. Existing systems often fall short, largely due to their reliance on fixed size, patch-level image features and insufficient incorporation of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10789v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10789v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10789v1-abstract-full" style="display: none;"> Radiology reporting generative AI holds significant potential to alleviate clinical workloads and streamline medical care. However, achieving high clinical accuracy is challenging, as radiological images often feature subtle lesions and intricate structures. Existing systems often fall short, largely due to their reliance on fixed size, patch-level image features and insufficient incorporation of pathological information. This can result in the neglect of such subtle patterns and inconsistent descriptions of crucial pathologies. To address these challenges, we propose an innovative approach that leverages pathology-aware regional prompts to explicitly integrate anatomical and pathological information of various scales, significantly enhancing the precision and clinical relevance of generated reports. We develop an anatomical region detector that extracts features from distinct anatomical areas, coupled with a novel multi-label lesion detector that identifies global pathologies. Our approach emulates the diagnostic process of radiologists, producing clinically accurate reports with comprehensive diagnostic capabilities. Experimental results show that our model outperforms previous state-of-the-art methods on most natural language generation and clinical efficacy metrics, with formal expert evaluations affirming its potential to enhance radiology practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10789v1-abstract-full').style.display = 'none'; document.getElementById('2411.10789v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10669">arXiv:2411.10669</a> <span> [<a href="https://arxiv.org/pdf/2411.10669">pdf</a>, <a href="https://arxiv.org/format/2411.10669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Awaker2.5-VL: Stably Scaling MLLMs with Parameter-Efficient Mixture of Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+J">Jinqiang Long</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yanqi Dai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guoxing Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongpeng Lin</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+N">Nanyi Fei</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yizhao Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiwu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10669v1-abstract-short" style="display: inline;"> As the research of Multimodal Large Language Models (MLLMs) becomes popular, an advancing MLLM model is typically required to handle various textual and visual tasks (e.g., VQA, Detection, OCR, and ChartQA) simultaneously for real-world applications. However, due to the significant differences in representation and distribution among data from various tasks, simply mixing data of all tasks togethe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10669v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10669v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10669v1-abstract-full" style="display: none;"> As the research of Multimodal Large Language Models (MLLMs) becomes popular, an advancing MLLM model is typically required to handle various textual and visual tasks (e.g., VQA, Detection, OCR, and ChartQA) simultaneously for real-world applications. However, due to the significant differences in representation and distribution among data from various tasks, simply mixing data of all tasks together leads to the well-known``multi-task conflict" issue, resulting in performance degradation across various tasks. To address this issue, we propose Awaker2.5-VL, a Mixture of Experts~(MoE) architecture suitable for MLLM, which acquires the multi-task capabilities through multiple sparsely activated experts. To speed up the training and inference of Awaker2.5-VL, each expert in our model is devised as a low-rank adaptation (LoRA) structure. Extensive experiments on multiple latest benchmarks demonstrate the effectiveness of Awaker2.5-VL. The code and model weight are released in our Project Page: https://github.com/MetabrainAGI/Awaker. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10669v1-abstract-full').style.display = 'none'; document.getElementById('2411.10669v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09928">arXiv:2411.09928</a> <span> [<a href="https://arxiv.org/pdf/2411.09928">pdf</a>, <a href="https://arxiv.org/format/2411.09928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Is Precise Recovery Necessary? A Task-Oriented Imputation Approach for Time Series Forecasting on Variable Subset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+Q">Qi Hao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Runchang Liang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yue Gao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Hao Dong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wei Fan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lu Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09928v1-abstract-short" style="display: inline;"> Variable Subset Forecasting (VSF) refers to a unique scenario in multivariate time series forecasting, where available variables in the inference phase are only a subset of the variables in the training phase. VSF presents significant challenges as the entire time series may be missing, and neither inter- nor intra-variable correlations persist. Such conditions impede the effectiveness of traditio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09928v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09928v1-abstract-full" style="display: none;"> Variable Subset Forecasting (VSF) refers to a unique scenario in multivariate time series forecasting, where available variables in the inference phase are only a subset of the variables in the training phase. VSF presents significant challenges as the entire time series may be missing, and neither inter- nor intra-variable correlations persist. Such conditions impede the effectiveness of traditional imputation methods, primarily focusing on filling in individual missing data points. Inspired by the principle of feature engineering that not all variables contribute positively to forecasting, we propose Task-Oriented Imputation for VSF (TOI-VSF), a novel framework shifts the focus from accurate data recovery to directly support the downstream forecasting task. TOI-VSF incorporates a self-supervised imputation module, agnostic to the forecasting model, designed to fill in missing variables while preserving the vital characteristics and temporal patterns of time series data. Additionally, we implement a joint learning strategy for imputation and forecasting, ensuring that the imputation process is directly aligned with and beneficial to the forecasting objective. Extensive experiments across four datasets demonstrate the superiority of TOI-VSF, outperforming baseline methods by $15\%$ on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09928v1-abstract-full').style.display = 'none'; document.getElementById('2411.09928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09523">arXiv:2411.09523</a> <span> [<a href="https://arxiv.org/pdf/2411.09523">pdf</a>, <a href="https://arxiv.org/format/2411.09523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Navigating the Risks: A Survey of Security, Privacy, and Ethics Threats in LLM-Based Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gan%2C+Y">Yuyou Gan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yong Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhe Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+P">Ping He</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+R">Rui Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiming Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qingming Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chunyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Songze Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yunjun Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yingcai Wu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shouling Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09523v1-abstract-short" style="display: inline;"> With the continuous development of large language models (LLMs), transformer-based models have made groundbreaking advances in numerous natural language processing (NLP) tasks, leading to the emergence of a series of agents that use LLMs as their control hub. While LLMs have achieved success in various tasks, they face numerous security and privacy threats, which become even more severe in the age… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09523v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09523v1-abstract-full" style="display: none;"> With the continuous development of large language models (LLMs), transformer-based models have made groundbreaking advances in numerous natural language processing (NLP) tasks, leading to the emergence of a series of agents that use LLMs as their control hub. While LLMs have achieved success in various tasks, they face numerous security and privacy threats, which become even more severe in the agent scenarios. To enhance the reliability of LLM-based applications, a range of research has emerged to assess and mitigate these risks from different perspectives. To help researchers gain a comprehensive understanding of various risks, this survey collects and analyzes the different threats faced by these agents. To address the challenges posed by previous taxonomies in handling cross-module and cross-stage threats, we propose a novel taxonomy framework based on the sources and impacts. Additionally, we identify six key features of LLM-based agents, based on which we summarize the current research progress and analyze their limitations. Subsequently, we select four representative agents as case studies to analyze the risks they may face in practical use. Finally, based on the aforementioned analyses, we propose future research directions from the perspectives of data, methodology, and policy, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09523v1-abstract-full').style.display = 'none'; document.getElementById('2411.09523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09145">arXiv:2411.09145</a> <span> [<a href="https://arxiv.org/pdf/2411.09145">pdf</a>, <a href="https://arxiv.org/format/2411.09145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for Egocentric Hand Object Interaction Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chengbo Yuan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Geng Chen</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+L">Li Yi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09145v2-abstract-short" style="display: inline;"> Egocentric Hand Object Interaction (HOI) videos provide valuable insights into human interactions with the physical world, attracting growing interest from the computer vision and robotics communities. A key task in fully understanding the geometry and dynamics of HOI scenes is dense pointclouds sequence reconstruction. However, the inherent motion of both hands and the camera makes this challengi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09145v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09145v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09145v2-abstract-full" style="display: none;"> Egocentric Hand Object Interaction (HOI) videos provide valuable insights into human interactions with the physical world, attracting growing interest from the computer vision and robotics communities. A key task in fully understanding the geometry and dynamics of HOI scenes is dense pointclouds sequence reconstruction. However, the inherent motion of both hands and the camera makes this challenging. Current methods often rely on time-consuming test-time optimization, making them impractical for reconstructing internet-scale videos. To address this, we introduce UniHOI, a model that unifies the estimation of all variables necessary for dense 4D reconstruction, including camera intrinsic, camera poses, and video depth, for egocentric HOI scene in a fast feed-forward manner. We end-to-end optimize all these variables to improve their consistency in 3D space. Furthermore, our model could be trained solely on large-scale monocular video dataset, overcoming the limitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain and zero-shot generalization setting, surpassing all baselines in pointclouds sequence reconstruction and long-term 3D scene flow recovery. UniHOI is the first approach to offer fast, dense, and generalizable monocular egocentric HOI scene reconstruction in the presence of motion. Code and trained model will be released in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09145v2-abstract-full').style.display = 'none'; document.getElementById('2411.09145v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08599">arXiv:2411.08599</a> <span> [<a href="https://arxiv.org/pdf/2411.08599">pdf</a>, <a href="https://arxiv.org/format/2411.08599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yingqi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoxia Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaorong Shi</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiming Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiqi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yuntao Hong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhiling Luo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jinyang Gao</a>, <a href="/search/cs?searchtype=author&query=Mou%2C+L">Liyu Mou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08599v1-abstract-short" style="display: inline;"> To tackle the challenges of large language model performance in natural language to SQL tasks, we introduce XiYan-SQL, an innovative framework that employs a multi-generator ensemble strategy to improve candidate generation. We introduce M-Schema, a semi-structured schema representation method designed to enhance the understanding of database structures. To enhance the quality and diversity of gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08599v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08599v1-abstract-full" style="display: none;"> To tackle the challenges of large language model performance in natural language to SQL tasks, we introduce XiYan-SQL, an innovative framework that employs a multi-generator ensemble strategy to improve candidate generation. We introduce M-Schema, a semi-structured schema representation method designed to enhance the understanding of database structures. To enhance the quality and diversity of generated candidate SQL queries, XiYan-SQL integrates the significant potential of in-context learning (ICL) with the precise control of supervised fine-tuning. On one hand, we propose a series of training strategies to fine-tune models to generate high-quality candidates with diverse preferences. On the other hand, we implement the ICL approach with an example selection method based on named entity recognition to prevent overemphasis on entities. The refiner optimizes each candidate by correcting logical or syntactical errors. To address the challenge of identifying the best candidate, we fine-tune a selection model to distinguish nuances of candidate SQL queries. The experimental results on multiple dialect datasets demonstrate the robustness of XiYan-SQL in addressing challenges across different scenarios. Overall, our proposed XiYan-SQL achieves the state-of-the-art execution accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. The proposed framework not only enhances the quality and diversity of SQL queries but also outperforms previous methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08599v1-abstract-full').style.display = 'none'; document.getElementById('2411.08599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; H.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07933">arXiv:2411.07933</a> <span> [<a href="https://arxiv.org/pdf/2411.07933">pdf</a>, <a href="https://arxiv.org/format/2411.07933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prediction of Acoustic Communication Performance for AUVs using Gaussian Process Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifei Gao</a>, <a href="/search/cs?searchtype=author&query=Yetkin%2C+H">Harun Yetkin</a>, <a href="/search/cs?searchtype=author&query=James%2C+M">McMahon James</a>, <a href="/search/cs?searchtype=author&query=Stilwell%2C+D+J">Daniel J. Stilwell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07933v1-abstract-short" style="display: inline;"> Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic communication to coordinate their actions effectively. However, the reliability of underwater acoustic communication decreases as the communication range between vehicles increases. Consequently, teams of cooperating AUVs typically make conservative assumptions about the maximum range at which they can communicate reliably. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07933v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07933v1-abstract-full" style="display: none;"> Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic communication to coordinate their actions effectively. However, the reliability of underwater acoustic communication decreases as the communication range between vehicles increases. Consequently, teams of cooperating AUVs typically make conservative assumptions about the maximum range at which they can communicate reliably. To address this limitation, we propose a novel approach that involves learning a map representing the probability of successful communication based on the locations of the transmitting and receiving vehicles. This probabilistic communication map accounts for factors such as the range between vehicles, environmental noise, and multi-path effects at a given location. In pursuit of this goal, we investigate the application of Gaussian process binary classification to generate the desired communication map. We specialize existing results to this specific binary classification problem and explore methods to incorporate uncertainty in vehicle location into the mapping process. Furthermore, we compare the prediction performance of the probability communication map generated using binary classification with that of a signal-to-noise ratio (SNR) communication map generated using Gaussian process regression. Our approach is experimentally validated using communication and navigation data collected during trials with a pair of Virginia Tech 690 AUVs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07933v1-abstract-full').style.display = 'none'; document.getElementById('2411.07933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05261">arXiv:2411.05261</a> <span> [<a href="https://arxiv.org/pdf/2411.05261">pdf</a>, <a href="https://arxiv.org/format/2411.05261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoding Report Generators: A Cyclic Vision-Language Adapter for Counterfactual Explanations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yingying Fang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zihao Jin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shaojie Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinda Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yijian Gao</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+J">Junzhi Ning</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Z">Zhiling Yue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Walsh%2C+S+L">Simon LF Walsh</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05261v1-abstract-short" style="display: inline;"> Despite significant advancements in report generation methods, a critical limitation remains: the lack of interpretability in the generated text. This paper introduces an innovative approach to enhance the explainability of text generated by report generation models. Our method employs cyclic text manipulation and visual comparison to identify and elucidate the features in the original content tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05261v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05261v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05261v1-abstract-full" style="display: none;"> Despite significant advancements in report generation methods, a critical limitation remains: the lack of interpretability in the generated text. This paper introduces an innovative approach to enhance the explainability of text generated by report generation models. Our method employs cyclic text manipulation and visual comparison to identify and elucidate the features in the original content that influence the generated text. By manipulating the generated reports and producing corresponding images, we create a comparative framework that highlights key attributes and their impact on the text generation process. This approach not only identifies the image features aligned to the generated text but also improves transparency but also provides deeper insights into the decision-making mechanisms of the report generation models. Our findings demonstrate the potential of this method to significantly enhance the interpretability and transparency of AI-generated reports. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05261v1-abstract-full').style.display = 'none'; document.getElementById('2411.05261v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05051">arXiv:2411.05051</a> <span> [<a href="https://arxiv.org/pdf/2411.05051">pdf</a>, <a href="https://arxiv.org/format/2411.05051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Intellectual Property Protection for Deep Learning Model and Dataset Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yongqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yansong Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chunyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongsheng Hu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+A">Anmin Fu</a>, <a href="/search/cs?searchtype=author&query=Susilo%2C+W">Willy Susilo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05051v1-abstract-short" style="display: inline;"> With the growing applications of Deep Learning (DL), especially recent spectacular achievements of Large Language Models (LLMs) such as ChatGPT and LLaMA, the commercial significance of these remarkable models has soared. However, acquiring well-trained models is costly and resource-intensive. It requires a considerable high-quality dataset, substantial investment in dedicated architecture design,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05051v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05051v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05051v1-abstract-full" style="display: none;"> With the growing applications of Deep Learning (DL), especially recent spectacular achievements of Large Language Models (LLMs) such as ChatGPT and LLaMA, the commercial significance of these remarkable models has soared. However, acquiring well-trained models is costly and resource-intensive. It requires a considerable high-quality dataset, substantial investment in dedicated architecture design, expensive computational resources, and efforts to develop technical expertise. Consequently, safeguarding the Intellectual Property (IP) of well-trained models is attracting increasing attention. In contrast to existing surveys overwhelmingly focusing on model IPP mainly, this survey not only encompasses the protection on model level intelligence but also valuable dataset intelligence. Firstly, according to the requirements for effective IPP design, this work systematically summarizes the general and scheme-specific performance evaluation metrics. Secondly, from proactive IP infringement prevention and reactive IP ownership verification perspectives, it comprehensively investigates and analyzes the existing IPP methods for both dataset and model intelligence. Additionally, from the standpoint of training settings, it delves into the unique challenges that distributed settings pose to IPP compared to centralized settings. Furthermore, this work examines various attacks faced by deep IPP techniques. Finally, we outline prospects for promising future directions that may act as a guide for innovative research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05051v1-abstract-full').style.display = 'none'; document.getElementById('2411.05051v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04962">arXiv:2411.04962</a> <span> [<a href="https://arxiv.org/pdf/2411.04962">pdf</a>, <a href="https://arxiv.org/format/2411.04962">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Position Paper On Diagnostic Uncertainty Estimation from Large Language Models: Next-Word Probability Is Not Pre-test Probability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yanjun Gao</a>, <a href="/search/cs?searchtype=author&query=Myers%2C+S">Skatje Myers</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shan Chen</a>, <a href="/search/cs?searchtype=author&query=Dligach%2C+D">Dmitriy Dligach</a>, <a href="/search/cs?searchtype=author&query=Miller%2C+T+A">Timothy A Miller</a>, <a href="/search/cs?searchtype=author&query=Bitterman%2C+D">Danielle Bitterman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanhua Chen</a>, <a href="/search/cs?searchtype=author&query=Mayampurath%2C+A">Anoop Mayampurath</a>, <a href="/search/cs?searchtype=author&query=Churpek%2C+M">Matthew Churpek</a>, <a href="/search/cs?searchtype=author&query=Afshar%2C+M">Majid Afshar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04962v1-abstract-short" style="display: inline;"> Large language models (LLMs) are being explored for diagnostic decision support, yet their ability to estimate pre-test probabilities, vital for clinical decision-making, remains limited. This study evaluates two LLMs, Mistral-7B and Llama3-70B, using structured electronic health record data on three diagnosis tasks. We examined three current methods of extracting LLM probability estimations and r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04962v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04962v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04962v1-abstract-full" style="display: none;"> Large language models (LLMs) are being explored for diagnostic decision support, yet their ability to estimate pre-test probabilities, vital for clinical decision-making, remains limited. This study evaluates two LLMs, Mistral-7B and Llama3-70B, using structured electronic health record data on three diagnosis tasks. We examined three current methods of extracting LLM probability estimations and revealed their limitations. We aim to highlight the need for improved techniques in LLM confidence estimation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04962v1-abstract-full').style.display = 'none'; document.getElementById('2411.04962v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to GenAI4Health Workshop at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04424">arXiv:2411.04424</a> <span> [<a href="https://arxiv.org/pdf/2411.04424">pdf</a>, <a href="https://arxiv.org/format/2411.04424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bayesian Calibration of Win Rate Estimation with LLM Evaluators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yicheng Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+G">Gonghan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Cohan%2C+A">Arman Cohan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04424v1-abstract-short" style="display: inline;"> Recent advances in large language models (LLMs) show the potential of using LLMs as evaluators for assessing the quality of text generations from LLMs. However, applying LLM evaluators naively to compare or judge between different systems can lead to unreliable results due to the intrinsic win rate estimation bias of LLM evaluators. In order to mitigate this problem, we propose two calibration met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04424v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04424v1-abstract-full" style="display: none;"> Recent advances in large language models (LLMs) show the potential of using LLMs as evaluators for assessing the quality of text generations from LLMs. However, applying LLM evaluators naively to compare or judge between different systems can lead to unreliable results due to the intrinsic win rate estimation bias of LLM evaluators. In order to mitigate this problem, we propose two calibration methods, Bayesian Win Rate Sampling (BWRS) and Bayesian Dawid-Skene, both of which leverage Bayesian inference to more accurately infer the true win rate of generative language models. We empirically validate our methods on six datasets covering story generation, summarization, and instruction following tasks. We show that both our methods are effective in improving the accuracy of win rate estimation using LLMs as evaluators, offering a promising direction for reliable automatic text quality evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04424v1-abstract-full').style.display = 'none'; document.getElementById('2411.04424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03697">arXiv:2411.03697</a> <span> [<a href="https://arxiv.org/pdf/2411.03697">pdf</a>, <a href="https://arxiv.org/format/2411.03697">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> TATAA: Programmable Mixed-Precision Transformer Acceleration with a Transformable Arithmetic Architecture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajun Wu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mo Song</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jingmin Zhao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yizhao Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=So%2C+H+K">Hayden Kwok-Hay So</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03697v1-abstract-short" style="display: inline;"> Modern transformer-based deep neural networks present unique technical challenges for effective acceleration in real-world applications. Apart from the vast amount of linear operations needed due to their sizes, modern transformer models are increasingly reliance on precise non-linear computations that make traditional low-bitwidth quantization methods and fixed-dataflow matrix accelerators ineffe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03697v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03697v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03697v1-abstract-full" style="display: none;"> Modern transformer-based deep neural networks present unique technical challenges for effective acceleration in real-world applications. Apart from the vast amount of linear operations needed due to their sizes, modern transformer models are increasingly reliance on precise non-linear computations that make traditional low-bitwidth quantization methods and fixed-dataflow matrix accelerators ineffective for end-to-end acceleration. To address this need to accelerate both linear and non-linear operations in a unified and programmable framework, this paper introduces TATAA. TATAA employs 8-bit integer (int8) arithmetic for quantized linear layer operations through post-training quantization, while it relies on bfloat16 floating-point arithmetic to approximate non-linear layers of a transformer model. TATAA hardware features a transformable arithmetic architecture that supports both formats during runtime with minimal overhead, enabling it to switch between a systolic array mode for int8 matrix multiplications and a SIMD mode for vectorized bfloat16 operations. An end-to-end compiler is presented to enable flexible mapping from emerging transformer models to the proposed hardware. Experimental results indicate that our mixed-precision design incurs only 0.14% to 1.16% accuracy drop when compared with the pre-trained single-precision transformer models across a range of vision, language, and generative text applications. Our prototype implementation on the Alveo U280 FPGA currently achieves 2935.2 GOPS throughput on linear layers and a maximum of 189.5 GFLOPS for non-linear operations, outperforming related works by up to 1.45x in end-to-end throughput and 2.29x in DSP efficiency, while achieving 2.19x higher power efficiency than modern NVIDIA RTX4090 GPU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03697v1-abstract-full').style.display = 'none'; document.getElementById('2411.03697v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02908">arXiv:2411.02908</a> <span> [<a href="https://arxiv.org/pdf/2411.02908">pdf</a>, <a href="https://arxiv.org/format/2411.02908">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Photon: Federated LLM Pre-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sani%2C+L">Lorenzo Sani</a>, <a href="/search/cs?searchtype=author&query=Iacob%2C+A">Alex Iacob</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zeyu Cao</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+R">Royson Lee</a>, <a href="/search/cs?searchtype=author&query=Marino%2C+B">Bill Marino</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yan Gao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Dongqi Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zexi Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wanru Zhao</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xinchi Qiu</a>, <a href="/search/cs?searchtype=author&query=Lane%2C+N+D">Nicholas D. Lane</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02908v1-abstract-short" style="display: inline;"> Scaling large language models (LLMs) demands extensive data and computing resources, which are traditionally constrained to data centers by the high-bandwidth requirements of distributed training. Low-bandwidth methods like federated learning (FL) could enable collaborative training of larger models across weakly-connected GPUs if they can effectively be used for pre-training. To achieve this, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02908v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02908v1-abstract-full" style="display: none;"> Scaling large language models (LLMs) demands extensive data and computing resources, which are traditionally constrained to data centers by the high-bandwidth requirements of distributed training. Low-bandwidth methods like federated learning (FL) could enable collaborative training of larger models across weakly-connected GPUs if they can effectively be used for pre-training. To achieve this, we introduce Photon, the first complete system for federated end-to-end LLM training, leveraging cross-silo FL for global-scale training with minimal communication overheads. Using Photon, we train the first federated family of decoder-only LLMs from scratch. We show that: (1) Photon can train model sizes up to 7B in a federated fashion while reaching an even better perplexity than centralized pre-training; (2) Photon model training time decreases with available compute, achieving a similar compute-time trade-off to centralized; and (3) Photon outperforms the wall-time of baseline distributed training methods by 35% via communicating 64x-512xless. Our proposal is robust to data heterogeneity and converges twice as fast as previous methods like DiLoCo. This surprising data efficiency stems from a unique approach combining small client batch sizes with extremely high learning rates, enabled by federated averaging's robustness to hyperparameters. Photon thus represents the first economical system for global internet-wide LLM pre-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02908v1-abstract-full').style.display = 'none'; document.getElementById('2411.02908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 appendix pages, 10 figures, 3 algorithms, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02673">arXiv:2411.02673</a> <span> [<a href="https://arxiv.org/pdf/2411.02673">pdf</a>, <a href="https://arxiv.org/format/2411.02673">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Multi-Transmotion: Pre-trained Model for Human Motion Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+P">Po-Chien Luan</a>, <a href="/search/cs?searchtype=author&query=Alahi%2C+A">Alexandre Alahi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02673v1-abstract-short" style="display: inline;"> The ability of intelligent systems to predict human behaviors is crucial, particularly in fields such as autonomous vehicle navigation and social robotics. However, the complexity of human motion have prevented the development of a standardized dataset for human motion prediction, thereby hindering the establishment of pre-trained models. In this paper, we address these limitations by integrating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02673v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02673v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02673v1-abstract-full" style="display: none;"> The ability of intelligent systems to predict human behaviors is crucial, particularly in fields such as autonomous vehicle navigation and social robotics. However, the complexity of human motion have prevented the development of a standardized dataset for human motion prediction, thereby hindering the establishment of pre-trained models. In this paper, we address these limitations by integrating multiple datasets, encompassing both trajectory and 3D pose keypoints, to propose a pre-trained model for human motion prediction. We merge seven distinct datasets across varying modalities and standardize their formats. To facilitate multimodal pre-training, we introduce Multi-Transmotion, an innovative transformer-based model designed for cross-modality pre-training. Additionally, we present a novel masking strategy to capture rich representations. Our methodology demonstrates competitive performance across various datasets on several downstream tasks, including trajectory prediction in the NBA and JTA datasets, as well as pose prediction in the AMASS and 3DPW datasets. The code is publicly available: https://github.com/vita-epfl/multi-transmotion <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02673v1-abstract-full').style.display = 'none'; document.getElementById('2411.02673v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoRL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00900">arXiv:2411.00900</a> <span> [<a href="https://arxiv.org/pdf/2411.00900">pdf</a>, <a href="https://arxiv.org/format/2411.00900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Intensity Field Decomposition for Tissue-Guided Neural Tomography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Meng-Xun Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jin-Gang Yu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Cui Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+G">Gui-Song Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00900v1-abstract-short" style="display: inline;"> Cone-beam computed tomography (CBCT) typically requires hundreds of X-ray projections, which raises concerns about radiation exposure. While sparse-view reconstruction reduces the exposure by using fewer projections, it struggles to achieve satisfactory image quality. To address this challenge, this article introduces a novel sparse-view CBCT reconstruction method, which empowers the neural field… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00900v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00900v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00900v1-abstract-full" style="display: none;"> Cone-beam computed tomography (CBCT) typically requires hundreds of X-ray projections, which raises concerns about radiation exposure. While sparse-view reconstruction reduces the exposure by using fewer projections, it struggles to achieve satisfactory image quality. To address this challenge, this article introduces a novel sparse-view CBCT reconstruction method, which empowers the neural field with human tissue regularization. Our approach, termed tissue-guided neural tomography (TNT), is motivated by the distinct intensity differences between bone and soft tissue in CBCT. Intuitively, separating these components may aid the learning process of the neural field. More precisely, TNT comprises a heterogeneous quadruple network and the corresponding training strategy. The network represents the intensity field as a combination of soft and hard tissue components, along with their respective textures. We train the network with guidance from estimated tissue projections, enabling efficient learning of the desired patterns for the network heads. Extensive experiments demonstrate that the proposed method significantly improves the sparse-view CBCT reconstruction with a limited number of projections ranging from 10 to 60. Our method achieves comparable reconstruction quality with fewer projections and faster convergence compared to state-of-the-art neural rendering based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00900v1-abstract-full').style.display = 'none'; document.getElementById('2411.00900v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00442">arXiv:2411.00442</a> <span> [<a href="https://arxiv.org/pdf/2411.00442">pdf</a>, <a href="https://arxiv.org/format/2411.00442">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> FPRev: Revealing the Order of Floating-Point Summation by Numerical Testing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+P">Peichen Xie</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yanjie Gao</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jilong Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00442v1-abstract-short" style="display: inline;"> The order of floating-point summation is a key factor in numerical reproducibility. However, this critical information is generally unspecified and unknown for most summation-based functions in numerical libraries, making it challenging to migrate them to new environments reproducibly. This paper presents novel, non-intrusive, testing-based algorithms that can reveal the order of floating-point su… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00442v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00442v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00442v1-abstract-full" style="display: none;"> The order of floating-point summation is a key factor in numerical reproducibility. However, this critical information is generally unspecified and unknown for most summation-based functions in numerical libraries, making it challenging to migrate them to new environments reproducibly. This paper presents novel, non-intrusive, testing-based algorithms that can reveal the order of floating-point summation by treating functions as callable black boxes. By constructing well-designed input that can cause the swamping phenomenon of floating-point addition, we can infer the order of summation from the output. We introduce FPRev, a tool that implements these algorithms, and validate its efficiency through extensive experiments with popular numerical libraries on various CPUs and GPUs (including those with Tensor Cores). FPRev reveals the varying summation orders across different libraries and devices, and outperforms other methods in terms of time complexity. The source code of FPRev is at \url{https://github.com/microsoft/RepDL/tree/main/tools/FPRev}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00442v1-abstract-full').style.display = 'none'; document.getElementById('2411.00442v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24039">arXiv:2410.24039</a> <span> [<a href="https://arxiv.org/pdf/2410.24039">pdf</a>, <a href="https://arxiv.org/format/2410.24039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Efficient Satellite-Ground Interconnection Design for Low-orbit Mega-Constellation Topology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenhao Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiazhi Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Quanwei Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Handong Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+K">Kun Qiu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yue Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24039v1-abstract-short" style="display: inline;"> The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24039v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24039v1-abstract-full" style="display: none;"> The low-orbit mega-constellation network (LMCN) is an important part of the space-air-ground integrated network system. An effective satellite-ground interconnection design can result in a stable constellation topology for LMCNs. A naive solution is accessing the satellite with the longest remaining service time (LRST), which is widely used in previous designs. The Coordinated Satellite-Ground Interconnecting (CSGI), the state-of-the-art algorithm, coordinates the establishment of ground-satellite links (GSLs). Compared with existing solutions, it reduces latency by 19% and jitter by 70% on average. However, CSGI only supports the scenario where terminals access only one satellite and cannot fully utilize the multi-access capabilities of terminals. Additionally, CSGI's high computational complexity poses deployment challenges. To overcome these problems, we propose the Classification-based Longest Remaining Service Time (C-LRST) algorithm. C-LRST supports the actual scenario with multi-access capabilities. It adds optional paths during routing with low computational complexity, improving end-to-end communications quality. We conduct our 1000s simulation from Brazil to Lithuania on the open-source platform Hypatia. Experiment results show that compared with CSGI, C-LRST reduces the latency and increases the throughput by approximately 60% and 40%, respectively. In addition, C-LRST's GSL switching number is 14, whereas CSGI is 23. C-LRST has better link stability than CSGI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24039v1-abstract-full').style.display = 'none'; document.getElementById('2410.24039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23515">arXiv:2410.23515</a> <span> [<a href="https://arxiv.org/pdf/2410.23515">pdf</a>, <a href="https://arxiv.org/format/2410.23515">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Generative forecasting of brain activity enhances Alzheimer's classification and interpretation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yutong Gao</a>, <a href="/search/cs?searchtype=author&query=Calhoun%2C+V+D">Vince D. Calhoun</a>, <a href="/search/cs?searchtype=author&query=Miller%2C+R+L">Robyn L. Miller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23515v1-abstract-short" style="display: inline;"> Understanding the relationship between cognition and intrinsic brain activity through purely data-driven approaches remains a significant challenge in neuroscience. Resting-state functional magnetic resonance imaging (rs-fMRI) offers a non-invasive method to monitor regional neural activity, providing a rich and complex spatiotemporal data structure. Deep learning has shown promise in capturing th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23515v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23515v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23515v1-abstract-full" style="display: none;"> Understanding the relationship between cognition and intrinsic brain activity through purely data-driven approaches remains a significant challenge in neuroscience. Resting-state functional magnetic resonance imaging (rs-fMRI) offers a non-invasive method to monitor regional neural activity, providing a rich and complex spatiotemporal data structure. Deep learning has shown promise in capturing these intricate representations. However, the limited availability of large datasets, especially for disease-specific groups such as Alzheimer's Disease (AD), constrains the generalizability of deep learning models. In this study, we focus on multivariate time series forecasting of independent component networks derived from rs-fMRI as a form of data augmentation, using both a conventional LSTM-based model and the novel Transformer-based BrainLM model. We assess their utility in AD classification, demonstrating how generative forecasting enhances classification performance. Post-hoc interpretation of BrainLM reveals class-specific brain network sensitivities associated with AD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23515v1-abstract-full').style.display = 'none'; document.getElementById('2410.23515v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22139">arXiv:2410.22139</a> <span> [<a href="https://arxiv.org/pdf/2410.22139">pdf</a>, <a href="https://arxiv.org/format/2410.22139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Lighten CARAFE: Dynamic Lightweight Upsampling with Guided Reassemble Kernels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+R">Ruigang Fu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qingyong Hu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaohu Dong</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yinghui Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Biao Li</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+P">Ping Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22139v1-abstract-short" style="display: inline;"> As a fundamental operation in modern machine vision models, feature upsampling has been widely used and investigated in the literatures. An ideal upsampling operation should be lightweight, with low computational complexity. That is, it can not only improve the overall performance but also not affect the model complexity. Content-aware Reassembly of Features (CARAFE) is a well-designed learnable o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22139v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22139v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22139v1-abstract-full" style="display: none;"> As a fundamental operation in modern machine vision models, feature upsampling has been widely used and investigated in the literatures. An ideal upsampling operation should be lightweight, with low computational complexity. That is, it can not only improve the overall performance but also not affect the model complexity. Content-aware Reassembly of Features (CARAFE) is a well-designed learnable operation to achieve feature upsampling. Albeit encouraging performance achieved, this method requires generating large-scale kernels, which brings a mass of extra redundant parameters, and inherently has limited scalability. To this end, we propose a lightweight upsampling operation, termed Dynamic Lightweight Upsampling (DLU) in this paper. In particular, it first constructs a small-scale source kernel space, and then samples the large-scale kernels from the kernel space by introducing learnable guidance offsets, hence avoiding introducing a large collection of trainable parameters in upsampling. Experiments on several mainstream vision tasks show that our DLU achieves comparable and even better performance to the original CARAFE, but with much lower complexity, e.g., DLU requires 91% fewer parameters and at least 63% fewer FLOPs (Floating Point Operations) than CARAFE in the case of 16x upsampling, but outperforms the CARAFE by 0.3% mAP in object detection. Code is available at https://github.com/Fu0511/Dynamic-Lightweight-Upsampling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22139v1-abstract-full').style.display = 'none'; document.getElementById('2410.22139v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21872">arXiv:2410.21872</a> <span> [<a href="https://arxiv.org/pdf/2410.21872">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Advancing Efficient Brain Tumor Multi-Class Classification -- New Insights from the Vision Mamba Model in Transfer Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+Y">Yinyi Lai</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+A">Anbo Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+J">Jiaqi Shang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongyu Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jia Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21872v2-abstract-short" style="display: inline;"> Early and accurate diagnosis of brain tumors is crucial for improving patient survival rates. However, the detection and classification of brain tumors are challenging due to their diverse types and complex morphological characteristics. This study investigates the application of pre-trained models for brain tumor classification, with a particular focus on deploying the Mamba model. We fine-tuned… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21872v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21872v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21872v2-abstract-full" style="display: none;"> Early and accurate diagnosis of brain tumors is crucial for improving patient survival rates. However, the detection and classification of brain tumors are challenging due to their diverse types and complex morphological characteristics. This study investigates the application of pre-trained models for brain tumor classification, with a particular focus on deploying the Mamba model. We fine-tuned several mainstream transfer learning models and applied them to the multi-class classification of brain tumors. By comparing these models to those trained from scratch, we demonstrated the significant advantages of transfer learning, especially in the medical imaging field, where annotated data is often limited. Notably, we introduced the Vision Mamba (Vim), a novel network architecture, and applied it for the first time in brain tumor classification, achieving exceptional classification accuracy. Experimental results indicate that the Vim model achieved 100% classification accuracy on an independent test set, emphasizing its potential for tumor classification tasks. These findings underscore the effectiveness of transfer learning in brain tumor classification and reveal that, compared to existing state-of-the-art models, the Vim model is lightweight, efficient, and highly accurate, offering a new perspective for clinical applications. Furthermore, the framework proposed in this study for brain tumor classification, based on transfer learning and the Vision Mamba model, is broadly applicable to other medical imaging classification problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21872v2-abstract-full').style.display = 'none'; document.getElementById('2410.21872v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21351">arXiv:2410.21351</a> <span> [<a href="https://arxiv.org/pdf/2410.21351">pdf</a>, <a href="https://arxiv.org/format/2410.21351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> LinFormer: A Linear-based Lightweight Transformer Architecture For Time-Aware MIMO Channel Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yanliang Jin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yifan Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng-Xiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21351v1-abstract-short" style="display: inline;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21351v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21351v1-abstract-full" style="display: none;"> The emergence of 6th generation (6G) mobile networks brings new challenges in supporting high-mobility communications, particularly in addressing the issue of channel aging. While existing channel prediction methods offer improved accuracy at the expense of increased computational complexity, limiting their practical application in mobile networks. To address these challenges, we present LinFormer, an innovative channel prediction framework based on a scalable, all-linear, encoder-only Transformer model. Our approach, inspired by natural language processing (NLP) models such as BERT, adapts an encoder-only architecture specifically for channel prediction tasks. We propose replacing the computationally intensive attention mechanism commonly used in Transformers with a time-aware multi-layer perceptron (TMLP), significantly reducing computational demands. The inherent time awareness of TMLP module makes it particularly suitable for channel prediction tasks. We enhance LinFormer's training process by employing a weighted mean squared error loss (WMSELoss) function and data augmentation techniques, leveraging larger, readily available communication datasets. Our approach achieves a substantial reduction in computational complexity while maintaining high prediction accuracy, making it more suitable for deployment in cost-effective base stations (BS). Comprehensive experiments using both simulated and measured data demonstrate that LinFormer outperforms existing methods across various mobility scenarios, offering a promising solution for future wireless communication systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21351v1-abstract-full').style.display = 'none'; document.getElementById('2410.21351v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21287">arXiv:2410.21287</a> <span> [<a href="https://arxiv.org/pdf/2410.21287">pdf</a>, <a href="https://arxiv.org/format/2410.21287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Assessment of OpenAI o1-Preview for Higher Order Thinking in Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Latif%2C+E">Ehsan Latif</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuchen Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yizhu Gao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lehong Shi</a>, <a href="/search/cs?searchtype=author&query=Nayaaba%2C+M">Matthew Nayaaba</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gyeonggeon Lee</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bewersdorff%2C+A">Arne Bewersdorff</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Luyang Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiantong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haoran Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxi Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jichao Yu</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Weihang You</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+V+S">Vincent Shung Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jin Lu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+F">Fei Dou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+P">Ping Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a> , et al. (2 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21287v1-abstract-short" style="display: inline;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacog… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21287v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21287v1-abstract-full" style="display: none;"> As artificial intelligence (AI) continues to advance, it demonstrates capabilities comparable to human intelligence, with significant potential to transform education and workforce development. This study evaluates OpenAI o1-preview's ability to perform higher-order cognitive tasks across 14 dimensions, including critical thinking, systems thinking, computational thinking, design thinking, metacognition, data literacy, creative thinking, abstract reasoning, quantitative reasoning, logical reasoning, analogical reasoning, and scientific reasoning. We used validated instruments like the Ennis-Weir Critical Thinking Essay Test and the Biological Systems Thinking Test to compare the o1-preview's performance with human performance systematically. Our findings reveal that o1-preview outperforms humans in most categories, achieving 150% better results in systems thinking, computational thinking, data literacy, creative thinking, scientific reasoning, and abstract reasoning. However, compared to humans, it underperforms by around 25% in logical reasoning, critical thinking, and quantitative reasoning. In analogical reasoning, both o1-preview and humans achieved perfect scores. Despite these strengths, the o1-preview shows limitations in abstract reasoning, where human psychology students outperform it, highlighting the continued importance of human oversight in tasks requiring high-level abstraction. These results have significant educational implications, suggesting a shift toward developing human skills that complement AI, such as creativity, abstract reasoning, and critical thinking. This study emphasizes the transformative potential of AI in education and calls for a recalibration of educational goals, teaching methods, and curricula to align with an AI-driven world. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21287v1-abstract-full').style.display = 'none'; document.getElementById('2410.21287v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An assessment of OpenAI o1-Preview for Higher Order Thinking in Education</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20786">arXiv:2410.20786</a> <span> [<a href="https://arxiv.org/pdf/2410.20786">pdf</a>, <a href="https://arxiv.org/format/2410.20786">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Constrained Policy Optimization: Improving Constrained Reinforcement Learning by Adapting Budgets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianmina Ma</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jingtian Ji</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yue Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20786v1-abstract-short" style="display: inline;"> Constrained reinforcement learning has achieved promising progress in safety-critical fields where both rewards and constraints are considered. However, constrained reinforcement learning methods face challenges in striking the right balance between task performance and constraint satisfaction and it is prone for them to get stuck in over-conservative or constraint violating local minima. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20786v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20786v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20786v1-abstract-full" style="display: none;"> Constrained reinforcement learning has achieved promising progress in safety-critical fields where both rewards and constraints are considered. However, constrained reinforcement learning methods face challenges in striking the right balance between task performance and constraint satisfaction and it is prone for them to get stuck in over-conservative or constraint violating local minima. In this paper, we propose Adversarial Constrained Policy Optimization (ACPO), which enables simultaneous optimization of reward and the adaptation of cost budgets during training. Our approach divides original constrained problem into two adversarial stages that are solved alternately, and the policy update performance of our algorithm can be theoretically guaranteed. We validate our method through experiments conducted on Safety Gymnasium and quadruped locomotion tasks. Results demonstrate that our algorithm achieves better performances compared to commonly used baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20786v1-abstract-full').style.display = 'none'; document.getElementById('2410.20786v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 8 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T01 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20745">arXiv:2410.20745</a> <span> [<a href="https://arxiv.org/pdf/2410.20745">pdf</a>, <a href="https://arxiv.org/format/2410.20745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yilun Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+T">Tianyu Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifan Gao</a>, <a href="/search/cs?searchtype=author&query=Jayarao%2C+P">Pratik Jayarao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mao Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Sarkhel%2C+R">Ritesh Sarkhel</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haodong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenju Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingfeng Yang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Qingyu Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xian Li</a>, <a href="/search/cs?searchtype=author&query=Nigam%2C+P">Priyanka Nigam</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yi Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qiang Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Bing Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20745v2-abstract-short" style="display: inline;"> Online shopping is a complex multi-task, few-shot learning problem with a wide and evolving range of entities, relations, and tasks. However, existing models and benchmarks are commonly tailored to specific tasks, falling short of capturing the full complexity of online shopping. Large Language Models (LLMs), with their multi-task and few-shot learning abilities, have the potential to profoundly t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20745v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20745v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20745v2-abstract-full" style="display: none;"> Online shopping is a complex multi-task, few-shot learning problem with a wide and evolving range of entities, relations, and tasks. However, existing models and benchmarks are commonly tailored to specific tasks, falling short of capturing the full complexity of online shopping. Large Language Models (LLMs), with their multi-task and few-shot learning abilities, have the potential to profoundly transform online shopping by alleviating task-specific engineering efforts and by providing users with interactive conversations. Despite the potential, LLMs face unique challenges in online shopping, such as domain-specific concepts, implicit knowledge, and heterogeneous user behaviors. Motivated by the potential and challenges, we propose Shopping MMLU, a diverse multi-task online shopping benchmark derived from real-world Amazon data. Shopping MMLU consists of 57 tasks covering 4 major shopping skills: concept understanding, knowledge reasoning, user behavior alignment, and multi-linguality, and can thus comprehensively evaluate the abilities of LLMs as general shop assistants. With Shopping MMLU, we benchmark over 20 existing LLMs and uncover valuable insights about practices and prospects of building versatile LLM-based shop assistants. Shopping MMLU can be publicly accessed at https://github.com/KL4805/ShoppingMMLU. In addition, with Shopping MMLU, we host a competition in KDD Cup 2024 with over 500 participating teams. The winning solutions and the associated workshop can be accessed at our website https://amazon-kddcup24.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20745v2-abstract-full').style.display = 'none'; document.getElementById('2410.20745v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Datasets and Benchmarks Track Accepted. Modified typos in Figure 9</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20733">arXiv:2410.20733</a> <span> [<a href="https://arxiv.org/pdf/2410.20733">pdf</a>, <a href="https://arxiv.org/format/2410.20733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SEG:Seeds-Enhanced Iterative Refinement Graph Neural Network for Entity Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ai%2C+W">Wei Ai</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yinghui Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianbin Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiayi Du</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+T">Tao Meng</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+Y">Yuntao Shou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Keqin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20733v1-abstract-short" style="display: inline;"> Entity alignment is crucial for merging knowledge across knowledge graphs, as it matches entities with identical semantics. The standard method matches these entities based on their embedding similarities using semi-supervised learning. However, diverse data sources lead to non-isomorphic neighborhood structures for aligned entities, complicating alignment, especially for less common and sparsely… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20733v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20733v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20733v1-abstract-full" style="display: none;"> Entity alignment is crucial for merging knowledge across knowledge graphs, as it matches entities with identical semantics. The standard method matches these entities based on their embedding similarities using semi-supervised learning. However, diverse data sources lead to non-isomorphic neighborhood structures for aligned entities, complicating alignment, especially for less common and sparsely connected entities. This paper presents a soft label propagation framework that integrates multi-source data and iterative seed enhancement, addressing scalability challenges in handling extensive datasets where scale computing excels. The framework uses seeds for anchoring and selects optimal relationship pairs to create soft labels rich in neighborhood features and semantic relationship data. A bidirectional weighted joint loss function is implemented, which reduces the distance between positive samples and differentially processes negative samples, taking into account the non-isomorphic neighborhood structures. Our method outperforms existing semi-supervised approaches, as evidenced by superior results on multiple datasets, significantly improving the quality of entity alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20733v1-abstract-full').style.display = 'none'; document.getElementById('2410.20733v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20514">arXiv:2410.20514</a> <span> [<a href="https://arxiv.org/pdf/2410.20514">pdf</a>, <a href="https://arxiv.org/format/2410.20514">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Aware Decision-Making and Planning for Autonomous Forced Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jian Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yulong Gao</a>, <a href="/search/cs?searchtype=author&query=Olofsson%2C+B">Bj枚rn Olofsson</a>, <a href="/search/cs?searchtype=author&query=Frisk%2C+E">Erik Frisk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20514v1-abstract-short" style="display: inline;"> In this paper, we develop an uncertainty-aware decision-making and motion-planning method for an autonomous ego vehicle in forced merging scenarios, considering the motion uncertainty of surrounding vehicles. The method dynamically captures the uncertainty of surrounding vehicles by online estimation of their acceleration bounds, enabling a reactive but rapid understanding of the uncertainty chara… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20514v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20514v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20514v1-abstract-full" style="display: none;"> In this paper, we develop an uncertainty-aware decision-making and motion-planning method for an autonomous ego vehicle in forced merging scenarios, considering the motion uncertainty of surrounding vehicles. The method dynamically captures the uncertainty of surrounding vehicles by online estimation of their acceleration bounds, enabling a reactive but rapid understanding of the uncertainty characteristics of the surrounding vehicles. By leveraging these estimated bounds, a non-conservative forward occupancy of surrounding vehicles is predicted over a horizon, which is incorporated in both the decision-making process and the motion-planning strategy, to enhance the resilience and safety of the planned reference trajectory. The method successfully fulfills the tasks in challenging forced merging scenarios, and the properties are illustrated by comparison with several alternative approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20514v1-abstract-full').style.display = 'none'; document.getElementById('2410.20514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 63rd IEEE Conference on Decision and Control, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20508">arXiv:2410.20508</a> <span> [<a href="https://arxiv.org/pdf/2410.20508">pdf</a>, <a href="https://arxiv.org/format/2410.20508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Referring Human Pose and Mask Estimation in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+B">Bo Miao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+M">Mingtao Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zijie Wu</a>, <a href="/search/cs?searchtype=author&query=Bennamoun%2C+M">Mohammed Bennamoun</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yongsheng Gao</a>, <a href="/search/cs?searchtype=author&query=Mian%2C+A">Ajmal Mian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20508v1-abstract-short" style="display: inline;"> We introduce Referring Human Pose and Mask Estimation (R-HPM) in the wild, where either a text or positional prompt specifies the person of interest in an image. This new task holds significant potential for human-centric applications such as assistive robotics and sports analysis. In contrast to previous works, R-HPM (i) ensures high-quality, identity-aware results corresponding to the referred p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20508v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20508v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20508v1-abstract-full" style="display: none;"> We introduce Referring Human Pose and Mask Estimation (R-HPM) in the wild, where either a text or positional prompt specifies the person of interest in an image. This new task holds significant potential for human-centric applications such as assistive robotics and sports analysis. In contrast to previous works, R-HPM (i) ensures high-quality, identity-aware results corresponding to the referred person, and (ii) simultaneously predicts human pose and mask for a comprehensive representation. To achieve this, we introduce a large-scale dataset named RefHuman, which substantially extends the MS COCO dataset with additional text and positional prompt annotations. RefHuman includes over 50,000 annotated instances in the wild, each equipped with keypoint, mask, and prompt annotations. To enable prompt-conditioned estimation, we propose the first end-to-end promptable approach named UniPHD for R-HPM. UniPHD extracts multimodal representations and employs a proposed pose-centric hierarchical decoder to process (text or positional) instance queries and keypoint queries, producing results specific to the referred person. Extensive experiments demonstrate that UniPHD produces quality results based on user-friendly prompts and achieves top-tier performance on RefHuman val and MS COCO val2017. Data and Code: https://github.com/bo-miao/RefHuman <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20508v1-abstract-full').style.display = 'none'; document.getElementById('2410.20508v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024. https://github.com/bo-miao/RefHuman</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19599">arXiv:2410.19599</a> <span> [<a href="https://arxiv.org/pdf/2410.19599">pdf</a>, <a href="https://arxiv.org/format/2410.19599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Take Caution in Using LLMs as Human Surrogates: Scylla Ex Machina </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dokyun Lee</a>, <a href="/search/cs?searchtype=author&query=Burtch%2C+G">Gordon Burtch</a>, <a href="/search/cs?searchtype=author&query=Fazelpour%2C+S">Sina Fazelpour</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19599v2-abstract-short" style="display: inline;"> Recent studies suggest large language models (LLMs) can exhibit human-like reasoning, aligning with human behavior in economic experiments, surveys, and political discourse. This has led many to propose that LLMs can be used as surrogates or simulations for humans in social science research. However, LLMs differ fundamentally from humans, relying on probabilistic patterns, absent the embodied expe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19599v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19599v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19599v2-abstract-full" style="display: none;"> Recent studies suggest large language models (LLMs) can exhibit human-like reasoning, aligning with human behavior in economic experiments, surveys, and political discourse. This has led many to propose that LLMs can be used as surrogates or simulations for humans in social science research. However, LLMs differ fundamentally from humans, relying on probabilistic patterns, absent the embodied experiences or survival objectives that shape human cognition. We assess the reasoning depth of LLMs using the 11-20 money request game. Nearly all advanced approaches fail to replicate human behavior distributions across many models. Causes of failure are diverse and unpredictable, relating to input language, roles, and safeguarding. These results advise caution when using LLMs to study human behavior or as surrogates or simulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19599v2-abstract-full').style.display = 'none'; document.getElementById('2410.19599v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19544">arXiv:2410.19544</a> <span> [<a href="https://arxiv.org/pdf/2410.19544">pdf</a>, <a href="https://arxiv.org/format/2410.19544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PMM-Net: Single-stage Multi-agent Trajectory Prediction with Patching-based Embedding and Explicit Modal Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huajian Liu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+W">Wei Dong</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+K">Kunpeng Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yongzhuo Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19544v1-abstract-short" style="display: inline;"> Analyzing and forecasting trajectories of agents like pedestrians plays a pivotal role for embodied intelligent applications. The inherent indeterminacy of human behavior and complex social interaction among a rich variety of agents make this task more challenging than common time-series forecasting. In this letter, we aim to explore a distinct formulation for multi-agent trajectory prediction fra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19544v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19544v1-abstract-full" style="display: none;"> Analyzing and forecasting trajectories of agents like pedestrians plays a pivotal role for embodied intelligent applications. The inherent indeterminacy of human behavior and complex social interaction among a rich variety of agents make this task more challenging than common time-series forecasting. In this letter, we aim to explore a distinct formulation for multi-agent trajectory prediction framework. Specifically, we proposed a patching-based temporal feature extraction module and a graph-based social feature extraction module, enabling effective feature extraction and cross-scenario generalization. Moreover, we reassess the role of social interaction and present a novel method based on explicit modality modulation to integrate temporal and social features, thereby constructing an efficient single-stage inference pipeline. Results on public benchmark datasets demonstrate the superior performance of our model compared with the state-of-the-art methods. The code is available at: github.com/TIB-K330/pmm-net. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19544v1-abstract-full').style.display = 'none'; document.getElementById('2410.19544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18647">arXiv:2410.18647</a> <span> [<a href="https://arxiv.org/pdf/2410.18647">pdf</a>, <a href="https://arxiv.org/format/2410.18647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Data Scaling Laws in Imitation Learning for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fanqi Lin</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yingdong Hu</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+P">Pingyue Sheng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+C">Chuan Wen</a>, <a href="/search/cs?searchtype=author&query=You%2C+J">Jiacheng You</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18647v1-abstract-short" style="display: inline;"> Data scaling has revolutionized fields like natural language processing and computer vision, providing models with remarkable generalization capabilities. In this paper, we investigate whether similar data scaling laws exist in robotics, particularly in robotic manipulation, and whether appropriate data scaling can yield single-task robot policies that can be deployed zero-shot for any object with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18647v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18647v1-abstract-full" style="display: none;"> Data scaling has revolutionized fields like natural language processing and computer vision, providing models with remarkable generalization capabilities. In this paper, we investigate whether similar data scaling laws exist in robotics, particularly in robotic manipulation, and whether appropriate data scaling can yield single-task robot policies that can be deployed zero-shot for any object within the same category in any environment. To this end, we conduct a comprehensive empirical study on data scaling in imitation learning. By collecting data across numerous environments and objects, we study how a policy's generalization performance changes with the number of training environments, objects, and demonstrations. Throughout our research, we collect over 40,000 demonstrations and execute more than 15,000 real-world robot rollouts under a rigorous evaluation protocol. Our findings reveal several intriguing results: the generalization performance of the policy follows a roughly power-law relationship with the number of environments and objects. The diversity of environments and objects is far more important than the absolute number of demonstrations; once the number of demonstrations per environment or object reaches a certain threshold, additional demonstrations have minimal effect. Based on these insights, we propose an efficient data collection strategy. With four data collectors working for one afternoon, we collect sufficient data to enable the policies for two tasks to achieve approximately 90% success rates in novel environments with unseen objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18647v1-abstract-full').style.display = 'none'; document.getElementById('2410.18647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository