CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 10,712 results for author: <span class="mathjax">Wang, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wang%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wang, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14269">arXiv:2411.14269</a> <span> [<a href="https://arxiv.org/pdf/2411.14269">pdf</a>, <a href="https://arxiv.org/format/2411.14269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Guided MRI Reconstruction via Schr枚dinger Bridge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tian Zhou</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhuo-xu Cui</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bingsheng Huang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hairong Zheng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dong Liang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yanjie Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14269v1-abstract-short" style="display: inline;"> Magnetic Resonance Imaging (MRI) is a multi-contrast imaging technique in which different contrast images share similar structural information. However, conventional diffusion models struggle to effectively leverage this structural similarity. Recently, the Schr枚dinger Bridge (SB), a nonlinear extension of the diffusion model, has been proposed to establish diffusion paths between any distribution… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14269v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14269v1-abstract-full" style="display: none;"> Magnetic Resonance Imaging (MRI) is a multi-contrast imaging technique in which different contrast images share similar structural information. However, conventional diffusion models struggle to effectively leverage this structural similarity. Recently, the Schr枚dinger Bridge (SB), a nonlinear extension of the diffusion model, has been proposed to establish diffusion paths between any distributions, allowing the incorporation of guided priors. This study proposes an SB-based, multi-contrast image-guided reconstruction framework that establishes a diffusion bridge between the guiding and target image distributions. By using the guiding image along with data consistency during sampling, the target image is reconstructed more accurately. To better address structural differences between images, we introduce an inversion strategy from the field of image editing, termed $\mathbf{I}^2$SB-inversion. Experiments on a paried T1 and T2-FLAIR datasets demonstrate that $\mathbf{I}^2$SB-inversion achieve a high acceleration up to 14.4 and outperforms existing methods in terms of both reconstruction accuracy and stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14269v1-abstract-full').style.display = 'none'; document.getElementById('2411.14269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14169">arXiv:2411.14169</a> <span> [<a href="https://arxiv.org/pdf/2411.14169">pdf</a>, <a href="https://arxiv.org/format/2411.14169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spatiotemporal Decoupling for Efficient Vision-Based Occupancy Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingyi Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xieyuanli Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Junyi Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jintao Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+L">Ling Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14169v1-abstract-short" style="display: inline;"> The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14169v1-abstract-full" style="display: none;"> The task of occupancy forecasting (OCF) involves utilizing past and present perception data to predict future occupancy states of autonomous vehicle surrounding environments, which is critical for downstream tasks such as obstacle avoidance and path planning. Existing 3D OCF approaches struggle to predict plausible spatial details for movable objects and suffer from slow inference speeds due to neglecting the bias and uneven distribution of changing occupancy states in both space and time. In this paper, we propose a novel spatiotemporal decoupling vision-based paradigm to explicitly tackle the bias and achieve both effective and efficient 3D OCF. To tackle spatial bias in empty areas, we introduce a novel spatial representation that decouples the conventional dense 3D format into 2D bird's-eye view (BEV) occupancy with corresponding height values, enabling 3D OCF derived only from 2D predictions thus enhancing efficiency. To reduce temporal bias on static voxels, we design temporal decoupling to improve end-to-end OCF by temporally associating instances via predicted flows. We develop an efficient multi-head network EfficientOCF to achieve 3D OCF with our devised spatiotemporally decoupled representation. A new metric, conditional IoU (C-IoU), is also introduced to provide a robust 3D OCF performance assessment, especially in datasets with missing or incomplete annotations. The experimental results demonstrate that EfficientOCF surpasses existing baseline methods on accuracy and efficiency, achieving state-of-the-art performance with a fast inference time of 82.33ms with a single GPU. Our code will be released as open source. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14169v1-abstract-full').style.display = 'none'; document.getElementById('2411.14169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14121">arXiv:2411.14121</a> <span> [<a href="https://arxiv.org/pdf/2411.14121">pdf</a>, <a href="https://arxiv.org/format/2411.14121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Learning from "Silly" Questions Improves Large Language Models, But Only Slightly </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+T">Tingyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shudong Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yidong Wang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+D+F">Derek F. Wong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Han Yu</a>, <a href="/search/cs?searchtype=author&query=Shinozaki%2C+T">Takahiro Shinozaki</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jindong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14121v1-abstract-short" style="display: inline;"> Constructing high-quality Supervised Fine-Tuning (SFT) datasets is critical for the training of large language models (LLMs). Recent studies have shown that using data from a specific source, Ruozhiba, a Chinese website where users ask "silly" questions to better understand certain topics, can lead to better fine-tuning performance. This paper aims to explore some hidden factors: the potential int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14121v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14121v1-abstract-full" style="display: none;"> Constructing high-quality Supervised Fine-Tuning (SFT) datasets is critical for the training of large language models (LLMs). Recent studies have shown that using data from a specific source, Ruozhiba, a Chinese website where users ask "silly" questions to better understand certain topics, can lead to better fine-tuning performance. This paper aims to explore some hidden factors: the potential interpretations of its success and a large-scale evaluation of the performance. First, we leverage GPT-4 to analyze the successful cases of Ruozhiba questions from the perspective of education, psychology, and cognitive science, deriving a set of explanatory rules. Then, we construct fine-tuning datasets by applying these rules to the MMLU training set. Surprisingly, our results indicate that rules can significantly improve model performance in certain tasks, while potentially diminishing performance on others. For example, SFT data generated following the "Counterintuitive Thinking" rule can achieve approximately a 5% improvement on the "Global Facts" task, whereas the "Blurring the Conceptual Boundaries" rule leads to a performance drop of 6.14% on the "Econometrics" task. In addition, for specific tasks, different rules tend to have a consistent impact on model performance. This suggests that the differences between the extracted rules are not as significant, and the effectiveness of the rules is relatively consistent across tasks. Our research highlights the importance of considering task diversity and rule applicability when constructing SFT datasets to achieve more comprehensive performance improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14121v1-abstract-full').style.display = 'none'; document.getElementById('2411.14121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14062">arXiv:2411.14062</a> <span> [<a href="https://arxiv.org/pdf/2411.14062">pdf</a>, <a href="https://arxiv.org/format/2411.14062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MMGenBench: Evaluating the Limits of LMMs from the Text-to-Image Generation Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hailang Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yong Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zixuan Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huaqiu Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tongwen Huang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiangxiang Chu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14062v1-abstract-short" style="display: inline;"> Large Multimodal Models (LMMs) have demonstrated remarkable capabilities. While existing benchmarks for evaluating LMMs mainly focus on image comprehension, few works evaluate them from the image generation perspective. To address this issue, we propose a straightforward automated evaluation pipeline. Specifically, this pipeline requires LMMs to generate an image-prompt from a given input image. S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14062v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14062v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) have demonstrated remarkable capabilities. While existing benchmarks for evaluating LMMs mainly focus on image comprehension, few works evaluate them from the image generation perspective. To address this issue, we propose a straightforward automated evaluation pipeline. Specifically, this pipeline requires LMMs to generate an image-prompt from a given input image. Subsequently, it employs text-to-image generative models to create a new image based on these generated prompts. Finally, we evaluate the performance of LMMs by comparing the original image with the generated one. Furthermore, we introduce MMGenBench-Test, a comprehensive benchmark developed to evaluate LMMs across 13 distinct image patterns, and MMGenBench-Domain, targeting the performance evaluation of LMMs within the generative image domain. A thorough evaluation involving over 50 popular LMMs demonstrates the effectiveness and reliability in both the pipeline and benchmark. Our observations indicate that numerous LMMs excelling in existing benchmarks fail to adequately complete the basic tasks, related to image understanding and description. This finding highlights the substantial potential for performance improvement in current LMMs and suggests avenues for future model optimization. Concurrently, our pipeline facilitates the efficient assessment of LMMs performance across diverse domains by using solely image inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14062v1-abstract-full').style.display = 'none'; document.getElementById('2411.14062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This project is available at: https://github.com/lerogo/MMGenBench</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14046">arXiv:2411.14046</a> <span> [<a href="https://arxiv.org/pdf/2411.14046">pdf</a>, <a href="https://arxiv.org/format/2411.14046">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> REFOL: Resource-Efficient Federated Online Learning for Traffic Flow Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Sheng Sun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolong Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Min Liu</a>, <a href="/search/cs?searchtype=author&query=Bilal%2C+M">Muhammad Bilal</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuwei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xujing Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yu Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14046v1-abstract-short" style="display: inline;"> Multiple federated learning (FL) methods are proposed for traffic flow forecasting (TFF) to avoid heavy-transmission and privacy-leaking concerns resulting from the disclosure of raw data in centralized methods. However, these FL methods adopt offline learning which may yield subpar performance, when concept drift occurs, i.e., distributions of historical and future data vary. Online learning can… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14046v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14046v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14046v1-abstract-full" style="display: none;"> Multiple federated learning (FL) methods are proposed for traffic flow forecasting (TFF) to avoid heavy-transmission and privacy-leaking concerns resulting from the disclosure of raw data in centralized methods. However, these FL methods adopt offline learning which may yield subpar performance, when concept drift occurs, i.e., distributions of historical and future data vary. Online learning can detect concept drift during model training, thus more applicable to TFF. Nevertheless, the existing federated online learning method for TFF fails to efficiently solve the concept drift problem and causes tremendous computing and communication overhead. Therefore, we propose a novel method named Resource-Efficient Federated Online Learning (REFOL) for TFF, which guarantees prediction performance in a communication-lightweight and computation-efficient way. Specifically, we design a data-driven client participation mechanism to detect the occurrence of concept drift and determine clients' participation necessity. Subsequently, we propose an adaptive online optimization strategy, which guarantees prediction performance and meanwhile avoids meaningless model updates. Then, a graph convolution-based model aggregation mechanism is designed, aiming to assess participants' contribution based on spatial correlation without importing extra communication and computing consumption on clients. Finally, we conduct extensive experiments on real-world datasets to demonstrate the superiority of REFOL in terms of prediction improvement and resource economization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14046v1-abstract-full').style.display = 'none'; document.getElementById('2411.14046v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13789">arXiv:2411.13789</a> <span> [<a href="https://arxiv.org/pdf/2411.13789">pdf</a>, <a href="https://arxiv.org/format/2411.13789">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> LEADRE: Multi-Faceted Knowledge Enhanced LLM Empowered Display Advertisement Recommender System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+F">Fengxin Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yi Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chao Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuan Wang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xiaoxiang Deng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+W">Wei Xue</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dapeng Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+L">Lei Xiao</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Haijie Gu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongyan Liu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+B">Biao Qin</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13789v1-abstract-short" style="display: inline;"> Display advertising provides significant value to advertisers, publishers, and users. Traditional display advertising systems utilize a multi-stage architecture consisting of retrieval, coarse ranking, and final ranking. However, conventional retrieval methods rely on ID-based learning to rank mechanisms and fail to adequately utilize the content information of ads, which hampers their ability to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13789v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13789v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13789v1-abstract-full" style="display: none;"> Display advertising provides significant value to advertisers, publishers, and users. Traditional display advertising systems utilize a multi-stage architecture consisting of retrieval, coarse ranking, and final ranking. However, conventional retrieval methods rely on ID-based learning to rank mechanisms and fail to adequately utilize the content information of ads, which hampers their ability to provide diverse recommendation lists. To address this limitation, we propose leveraging the extensive world knowledge of LLMs. However, three key challenges arise when attempting to maximize the effectiveness of LLMs: "How to capture user interests", "How to bridge the knowledge gap between LLMs and advertising system", and "How to efficiently deploy LLMs". To overcome these challenges, we introduce a novel LLM-based framework called LLM Empowered Display ADvertisement REcommender system (LEADRE). LEADRE consists of three core modules: (1) The Intent-Aware Prompt Engineering introduces multi-faceted knowledge and designs intent-aware <Prompt, Response> pairs that fine-tune LLMs to generate ads tailored to users' personal interests. (2) The Advertising-Specific Knowledge Alignment incorporates auxiliary fine-tuning tasks and Direct Preference Optimization (DPO) to align LLMs with ad semantic and business value. (3) The Efficient System Deployment deploys LEADRE in an online environment by integrating both latency-tolerant and latency-sensitive service. Extensive offline experiments demonstrate the effectiveness of LEADRE and validate the contributions of individual modules. Online A/B test shows that LEADRE leads to a 1.57% and 1.17% GMV lift for serviced users on WeChat Channels and Moments separately. LEADRE has been deployed on both platforms, serving tens of billions of requests each day. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13789v1-abstract-full').style.display = 'none'; document.getElementById('2411.13789v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13724">arXiv:2411.13724</a> <span> [<a href="https://arxiv.org/pdf/2411.13724">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Exploring Large Language Models for Climate Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Karimi%2C+H+A">Hassan A. Karimi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13724v1-abstract-short" style="display: inline;"> With the increasing impacts of climate change, there is a growing demand for accessible tools that can provide reliable future climate information to support planning, finance, and other decision-making applications. Large language models (LLMs), such as GPT-4, present a promising approach to bridging the gap between complex climate data and the general public, offering a way for non-specialist us… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13724v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13724v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13724v1-abstract-full" style="display: none;"> With the increasing impacts of climate change, there is a growing demand for accessible tools that can provide reliable future climate information to support planning, finance, and other decision-making applications. Large language models (LLMs), such as GPT-4, present a promising approach to bridging the gap between complex climate data and the general public, offering a way for non-specialist users to obtain essential climate insights through natural language interaction. However, an essential challenge remains under-explored: evaluating the ability of LLMs to provide accurate and reliable future climate predictions, which is crucial for applications that rely on anticipating climate trends. In this study, we investigate the capability of GPT-4 in predicting rainfall at short-term (15-day) and long-term (12-month) scales. We designed a series of experiments to assess GPT's performance under different conditions, including scenarios with and without expert data inputs. Our results indicate that GPT, when operating independently, tends to generate conservative forecasts, often reverting to historical averages in the absence of clear trend signals. This study highlights both the potential and challenges of applying LLMs for future climate predictions, providing insights into their integration with climate-related applications and suggesting directions for enhancing their predictive capabilities in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13724v1-abstract-full').style.display = 'none'; document.getElementById('2411.13724v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13691">arXiv:2411.13691</a> <span> [<a href="https://arxiv.org/pdf/2411.13691">pdf</a>, <a href="https://arxiv.org/format/2411.13691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Augmented Generation for Domain-Specific Question Answering: A Case Study on Pittsburgh and CMU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haojia Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13691v1-abstract-short" style="display: inline;"> We designed a Retrieval-Augmented Generation (RAG) system to provide large language models with relevant documents for answering domain-specific questions about Pittsburgh and Carnegie Mellon University (CMU). We extracted over 1,800 subpages using a greedy scraping strategy and employed a hybrid annotation process, combining manual and Mistral-generated question-answer pairs, achieving an inter-a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13691v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13691v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13691v1-abstract-full" style="display: none;"> We designed a Retrieval-Augmented Generation (RAG) system to provide large language models with relevant documents for answering domain-specific questions about Pittsburgh and Carnegie Mellon University (CMU). We extracted over 1,800 subpages using a greedy scraping strategy and employed a hybrid annotation process, combining manual and Mistral-generated question-answer pairs, achieving an inter-annotator agreement (IAA) score of 0.7625. Our RAG framework integrates BM25 and FAISS retrievers, enhanced with a reranker for improved document retrieval accuracy. Experimental results show that the RAG system significantly outperforms a non-RAG baseline, particularly in time-sensitive and complex queries, with an F1 score improvement from 5.45% to 42.21% and recall of 56.18%. This study demonstrates the potential of RAG systems in enhancing answer precision and relevance, while identifying areas for further optimization in document retrieval and model training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13691v1-abstract-full').style.display = 'none'; document.getElementById('2411.13691v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13560">arXiv:2411.13560</a> <span> [<a href="https://arxiv.org/pdf/2411.13560">pdf</a>, <a href="https://arxiv.org/format/2411.13560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> AMSnet-KG: A Netlist Dataset for LLM-based AMS Circuit Auto-Design Using Knowledge Graph RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yichen Shi</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Z">Zhuofu Tao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuhao Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianjia Zhou</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaxing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Genhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Alvin Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiping Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Ting-Jung Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13560v1-abstract-short" style="display: inline;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13560v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13560v1-abstract-full" style="display: none;"> High-performance analog and mixed-signal (AMS) circuits are mainly full-custom designed, which is time-consuming and labor-intensive. A significant portion of the effort is experience-driven, which makes the automation of AMS circuit design a formidable challenge. Large language models (LLMs) have emerged as powerful tools for Electronic Design Automation (EDA) applications, fostering advancements in the automatic design process for large-scale AMS circuits. However, the absence of high-quality datasets has led to issues such as model hallucination, which undermines the robustness of automatically generated circuit designs. To address this issue, this paper introduces AMSnet-KG, a dataset encompassing various AMS circuit schematics and netlists. We construct a knowledge graph with annotations on detailed functional and performance characteristics. Facilitated by AMSnet-KG, we propose an automated AMS circuit generation framework that utilizes the comprehensive knowledge embedded in LLMs. We first formulate a design strategy (e.g., circuit architecture using a number of circuit components) based on required specifications. Next, matched circuit components are retrieved and assembled into a complete topology, and transistor sizing is obtained through Bayesian optimization. Simulation results of the netlist are fed back to the LLM for further topology refinement, ensuring the circuit design specifications are met. We perform case studies of operational amplifier and comparator design to verify the automatic design flow from specifications to netlists with minimal human effort. The dataset used in this paper will be open-sourced upon publishing of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13560v1-abstract-full').style.display = 'none'; document.getElementById('2411.13560v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13504">arXiv:2411.13504</a> <span> [<a href="https://arxiv.org/pdf/2411.13504">pdf</a>, <a href="https://arxiv.org/format/2411.13504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Memory and Reasoning Ability in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+M">Mingyu Jin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weidi Luo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Sitao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyi Wang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+W">Wenyue Hua</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongfeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13504v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated strong performance in handling complex tasks requiring both extensive knowledge and reasoning abilities. However, the existing LLM inference pipeline operates as an opaque process without explicit separation between knowledge retrieval and reasoning steps, making the model's decision-making process unclear and disorganized. This ambiguity can lead to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13504v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13504v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13504v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated strong performance in handling complex tasks requiring both extensive knowledge and reasoning abilities. However, the existing LLM inference pipeline operates as an opaque process without explicit separation between knowledge retrieval and reasoning steps, making the model's decision-making process unclear and disorganized. This ambiguity can lead to issues such as hallucinations and knowledge forgetting, which significantly impact the reliability of LLMs in high-stakes domains. In this paper, we propose a new inference paradigm that decomposes the complex inference process into two distinct and clear actions: (1) memory recall: which retrieves relevant knowledge, and (2) reasoning: which performs logical steps based on the recalled knowledge. To facilitate this decomposition, we introduce two special tokens memory and reason, guiding the model to distinguish between steps that require knowledge retrieval and those that involve reasoning. Our experiment results show that this decomposition not only improves model performance but also enhances the interpretability of the inference process, enabling users to identify sources of error and refine model responses effectively. The code is available at https://github.com/MingyuJ666/Disentangling-Memory-and-Reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13504v2-abstract-full').style.display = 'none'; document.getElementById('2411.13504v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13503">arXiv:2411.13503</a> <span> [<a href="https://arxiv.org/pdf/2411.13503">pdf</a>, <a href="https://arxiv.org/format/2411.13503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VBench++: Comprehensive and Versatile Benchmark Suite for Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziqi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaojie Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yinan He</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiashuo Yu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Ziyue Dong</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qianli Ma</a>, <a href="/search/cs?searchtype=author&query=Chanpaisit%2C+N">Nattapol Chanpaisit</a>, <a href="/search/cs?searchtype=author&query=Si%2C+C">Chenyang Si</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuming Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaohui Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Limin Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13503v1-abstract-short" style="display: inline;"> Video generation has witnessed significant advancements, yet evaluating these models remains a challenge. A comprehensive evaluation benchmark for video generation is indispensable for two reasons: 1) Existing metrics do not fully align with human perceptions; 2) An ideal evaluation system should provide insights to inform future developments of video generation. To this end, we present VBench, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13503v1-abstract-full" style="display: none;"> Video generation has witnessed significant advancements, yet evaluating these models remains a challenge. A comprehensive evaluation benchmark for video generation is indispensable for two reasons: 1) Existing metrics do not fully align with human perceptions; 2) An ideal evaluation system should provide insights to inform future developments of video generation. To this end, we present VBench, a comprehensive benchmark suite that dissects "video generation quality" into specific, hierarchical, and disentangled dimensions, each with tailored prompts and evaluation methods. VBench has several appealing properties: 1) Comprehensive Dimensions: VBench comprises 16 dimensions in video generation (e.g., subject identity inconsistency, motion smoothness, temporal flickering, and spatial relationship, etc). The evaluation metrics with fine-grained levels reveal individual models' strengths and weaknesses. 2) Human Alignment: We also provide a dataset of human preference annotations to validate our benchmarks' alignment with human perception, for each evaluation dimension respectively. 3) Valuable Insights: We look into current models' ability across various evaluation dimensions, and various content types. We also investigate the gaps between video and image generation models. 4) Versatile Benchmarking: VBench++ supports evaluating text-to-video and image-to-video. We introduce a high-quality Image Suite with an adaptive aspect ratio to enable fair evaluations across different image-to-video generation settings. Beyond assessing technical quality, VBench++ evaluates the trustworthiness of video generative models, providing a more holistic view of model performance. 5) Full Open-Sourcing: We fully open-source VBench++ and continually add new video generation models to our leaderboard to drive forward the field of video generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13503v1-abstract-full').style.display = 'none'; document.getElementById('2411.13503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Leaderboard: https://huggingface.co/spaces/Vchitect/VBench_Leaderboard Code: https://github.com/Vchitect/VBench Project page: https://vchitect.github.io/VBench-project/ extension of arXiv:2311.17982. arXiv admin note: substantial text overlap with arXiv:2311.17982</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13322">arXiv:2411.13322</a> <span> [<a href="https://arxiv.org/pdf/2411.13322">pdf</a>, <a href="https://arxiv.org/format/2411.13322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling Laws for Online Advertisement Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunli Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zixuan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+S">Shiyang Wen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+K">Kun Gai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13322v1-abstract-short" style="display: inline;"> The scaling law is a notable property of neural network models and has significantly propelled the development of large language models. Scaling laws hold great promise in guiding model design and resource allocation. Recent research increasingly shows that scaling laws are not limited to NLP tasks or Transformer architectures; they also apply to domains such as recommendation. However, there is s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13322v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13322v1-abstract-full" style="display: none;"> The scaling law is a notable property of neural network models and has significantly propelled the development of large language models. Scaling laws hold great promise in guiding model design and resource allocation. Recent research increasingly shows that scaling laws are not limited to NLP tasks or Transformer architectures; they also apply to domains such as recommendation. However, there is still a lack of literature on scaling law research in online advertisement retrieval systems. This may be because 1) identifying the scaling law for resource cost and online revenue is often expensive in both time and training resources for large-scale industrial applications, and 2) varying settings for different systems prevent the scaling law from being applied across various scenarios. To address these issues, we propose a lightweight paradigm to identify the scaling law of online revenue and machine cost for a certain online advertisement retrieval scenario with a low experimental cost. Specifically, we focus on a sole factor (FLOPs) and propose an offline metric named R/R* that exhibits a high linear correlation with online revenue for retrieval models. We estimate the machine cost offline via a simulation algorithm. Thus, we can transform most online experiments into low-cost offline experiments. We conduct comprehensive experiments to verify the effectiveness of our proposed metric R/R* and to identify the scaling law in the online advertisement retrieval system of Kuaishou. With the scaling law, we demonstrate practical applications for ROI-constrained model designing and multi-scenario resource allocation in Kuaishou advertising system. To the best of our knowledge, this is the first work to study the scaling laws for online advertisement retrieval of real-world systems, showing great potential for scaling law in advertising system optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13322v1-abstract-full').style.display = 'none'; document.getElementById('2411.13322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13243">arXiv:2411.13243</a> <span> [<a href="https://arxiv.org/pdf/2411.13243">pdf</a>, <a href="https://arxiv.org/format/2411.13243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> XMask3D: Cross-modal Mask Reasoning for Open Vocabulary 3D Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanbo Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xumin Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiwen Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13243v1-abstract-short" style="display: inline;"> Existing methodologies in open vocabulary 3D semantic segmentation primarily concentrate on establishing a unified feature space encompassing 3D, 2D, and textual modalities. Nevertheless, traditional techniques such as global feature alignment or vision-language model distillation tend to impose only approximate correspondence, struggling notably with delineating fine-grained segmentation boundari… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13243v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13243v1-abstract-full" style="display: none;"> Existing methodologies in open vocabulary 3D semantic segmentation primarily concentrate on establishing a unified feature space encompassing 3D, 2D, and textual modalities. Nevertheless, traditional techniques such as global feature alignment or vision-language model distillation tend to impose only approximate correspondence, struggling notably with delineating fine-grained segmentation boundaries. To address this gap, we propose a more meticulous mask-level alignment between 3D features and the 2D-text embedding space through a cross-modal mask reasoning framework, XMask3D. In our approach, we developed a mask generator based on the denoising UNet from a pre-trained diffusion model, leveraging its capability for precise textual control over dense pixel representations and enhancing the open-world adaptability of the generated masks. We further integrate 3D global features as implicit conditions into the pre-trained 2D denoising UNet, enabling the generation of segmentation masks with additional 3D geometry awareness. Subsequently, the generated 2D masks are employed to align mask-level 3D representations with the vision-language feature space, thereby augmenting the open vocabulary capability of 3D geometry embeddings. Finally, we fuse complementary 2D and 3D mask features, resulting in competitive performance across multiple benchmarks for 3D open vocabulary semantic segmentation. Code is available at https://github.com/wangzy22/XMask3D. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13243v1-abstract-full').style.display = 'none'; document.getElementById('2411.13243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13239">arXiv:2411.13239</a> <span> [<a href="https://arxiv.org/pdf/2411.13239">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Transforming the Hybrid Cloud for Emerging AI Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deming Chen</a>, <a href="/search/cs?searchtype=author&query=Youssef%2C+A">Alaa Youssef</a>, <a href="/search/cs?searchtype=author&query=Pendse%2C+R">Ruchi Pendse</a>, <a href="/search/cs?searchtype=author&query=Schleife%2C+A">Andr茅 Schleife</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+B+K">Bryan K. Clark</a>, <a href="/search/cs?searchtype=author&query=Hamann%2C+H">Hendrik Hamann</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingrui He</a>, <a href="/search/cs?searchtype=author&query=Laino%2C+T">Teodoro Laino</a>, <a href="/search/cs?searchtype=author&query=Varshney%2C+L">Lav Varshney</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxiong Wang</a>, <a href="/search/cs?searchtype=author&query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&query=Jabbarvand%2C+R">Reyhaneh Jabbarvand</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianyin Xu</a>, <a href="/search/cs?searchtype=author&query=Kindratenko%2C+V">Volodymyr Kindratenko</a>, <a href="/search/cs?searchtype=author&query=Costa%2C+C">Carlos Costa</a>, <a href="/search/cs?searchtype=author&query=Adve%2C+S">Sarita Adve</a>, <a href="/search/cs?searchtype=author&query=Mendis%2C+C">Charith Mendis</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=N%C3%BA%C3%B1ez-Corrales%2C+S">Santiago N煤帽ez-Corrales</a>, <a href="/search/cs?searchtype=author&query=Ganti%2C+R">Raghu Ganti</a>, <a href="/search/cs?searchtype=author&query=Srivatsa%2C+M">Mudhakar Srivatsa</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+N+S">Nam Sung Kim</a>, <a href="/search/cs?searchtype=author&query=Torrellas%2C+J">Josep Torrellas</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jian Huang</a>, <a href="/search/cs?searchtype=author&query=Seelam%2C+S">Seetharami Seelam</a> , et al. (19 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13239v1-abstract-short" style="display: inline;"> This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches, emphasizing usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge techno… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13239v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13239v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13239v1-abstract-full" style="display: none;"> This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches, emphasizing usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge technologies such as generative and agentic AI, cross-layer automation and optimization, unified control plane, and composable and adaptive system architecture, the proposed framework addresses critical challenges in energy efficiency, performance, and cost-effectiveness. Incorporating quantum computing as it matures will enable quantum-accelerated simulations for materials science, climate modeling, and other high-impact domains. Collaborative efforts between academia and industry are central to this vision, driving advancements in foundation models for material design and climate solutions, scalable multimodal data processing, and enhanced physics-based AI emulators for applications like weather forecasting and carbon sequestration. Research priorities include advancing AI agentic systems, LLM as an Abstraction (LLMaaA), AI model optimization and unified abstractions across heterogeneous infrastructure, end-to-end edge-cloud transformation, efficient programming model, middleware and platform, secure infrastructure, application-adaptive cloud systems, and new quantum-classical collaborative workflows. These ideas and solutions encompass both theoretical and practical research questions, requiring coordinated input and support from the research community. This joint initiative aims to establish hybrid clouds as secure, efficient, and sustainable platforms, fostering breakthroughs in AI-driven applications and scientific discovery across academia, industry, and society. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13239v1-abstract-full').style.display = 'none'; document.getElementById('2411.13239v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">70 pages, 27 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13079">arXiv:2411.13079</a> <span> [<a href="https://arxiv.org/pdf/2411.13079">pdf</a>, <a href="https://arxiv.org/format/2411.13079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neural Internal Model Control: Learning a Robust Control Policy via Predictive Error Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+F">Feng Gao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13079v1-abstract-short" style="display: inline;"> Accurate motion control in the face of disturbances within complex environments remains a major challenge in robotics. Classical model-based approaches often struggle with nonlinearities and unstructured disturbances, while RL-based methods can be fragile when encountering unseen scenarios. In this paper, we propose a novel framework, Neural Internal Model Control, which integrates model-based con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13079v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13079v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13079v1-abstract-full" style="display: none;"> Accurate motion control in the face of disturbances within complex environments remains a major challenge in robotics. Classical model-based approaches often struggle with nonlinearities and unstructured disturbances, while RL-based methods can be fragile when encountering unseen scenarios. In this paper, we propose a novel framework, Neural Internal Model Control, which integrates model-based control with RL-based control to enhance robustness. Our framework streamlines the predictive model by applying Newton-Euler equations for rigid-body dynamics, eliminating the need to capture complex high-dimensional nonlinearities. This internal model combines model-free RL algorithms with predictive error feedback. Such a design enables a closed-loop control structure to enhance the robustness and generalizability of the control system. We demonstrate the effectiveness of our framework on both quadrotors and quadrupedal robots, achieving superior performance compared to state-of-the-art methods. Furthermore, real-world deployment on a quadrotor with rope-suspended payloads highlights the framework's robustness in sim-to-real transfer. Our code is released at https://github.com/thu-uav/NeuralIMC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13079v1-abstract-full').style.display = 'none'; document.getElementById('2411.13079v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to RAL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12992">arXiv:2411.12992</a> <span> [<a href="https://arxiv.org/pdf/2411.12992">pdf</a>, <a href="https://arxiv.org/format/2411.12992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MemoryFormer: Minimize Transformer Computation by Removing Fully-Connected Layers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yehui Tang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haochen Qin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhenli Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lin Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+K">Kai Han</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+H">Heng Liao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunhe Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12992v1-abstract-short" style="display: inline;"> In order to reduce the computational complexity of large language models, great efforts have been made to to improve the efficiency of transformer models such as linear attention and flash-attention. However, the model size and corresponding computational complexity are constantly scaled up in pursuit of higher performance. In this work, we present MemoryFormer, a novel transformer architecture wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12992v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12992v1-abstract-full" style="display: none;"> In order to reduce the computational complexity of large language models, great efforts have been made to to improve the efficiency of transformer models such as linear attention and flash-attention. However, the model size and corresponding computational complexity are constantly scaled up in pursuit of higher performance. In this work, we present MemoryFormer, a novel transformer architecture which significantly reduces the computational complexity (FLOPs) from a new perspective. We eliminate nearly all the computations of the transformer model except for the necessary computation required by the multi-head attention operation. This is made possible by utilizing an alternative method for feature transformation to replace the linear projection of fully-connected layers. Specifically, we first construct a group of in-memory lookup tables that store a large amount of discrete vectors to replace the weight matrix used in linear projection. We then use a hash algorithm to retrieve a correlated subset of vectors dynamically based on the input embedding. The retrieved vectors combined together will form the output embedding, which provides an estimation of the result of matrix multiplication operation in a fully-connected layer. Compared to conducting matrix multiplication, retrieving data blocks from memory is a much cheaper operation which requires little computations. We train MemoryFormer from scratch and conduct extensive experiments on various benchmarks to demonstrate the effectiveness of the proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12992v1-abstract-full').style.display = 'none'; document.getElementById('2411.12992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12713">arXiv:2411.12713</a> <span> [<a href="https://arxiv.org/pdf/2411.12713">pdf</a>, <a href="https://arxiv.org/format/2411.12713">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CATCH: Complementary Adaptive Token-level Contrastive Decoding to Mitigate Hallucinations in LVLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kan%2C+Z">Zhehan Kan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Ce Zhang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zihan Liao</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yapeng Tian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenming Yang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Junyuan Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xu Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Dongmei Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Q">Qingmin Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12713v1-abstract-short" style="display: inline;"> Large Vision-Language Model (LVLM) systems have demonstrated impressive vision-language reasoning capabilities but suffer from pervasive and severe hallucination issues, posing significant risks in critical domains such as healthcare and autonomous systems. Despite previous efforts to mitigate hallucinations, a persistent issue remains: visual defect from vision-language misalignment, creating a b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12713v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12713v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12713v1-abstract-full" style="display: none;"> Large Vision-Language Model (LVLM) systems have demonstrated impressive vision-language reasoning capabilities but suffer from pervasive and severe hallucination issues, posing significant risks in critical domains such as healthcare and autonomous systems. Despite previous efforts to mitigate hallucinations, a persistent issue remains: visual defect from vision-language misalignment, creating a bottleneck in visual processing capacity. To address this challenge, we develop Complementary Adaptive Token-level Contrastive Decoding to Mitigate Hallucinations in LVLMs (CATCH), based on the Information Bottleneck theory. CATCH introduces Complementary Visual Decoupling (CVD) for visual information separation, Non-Visual Screening (NVS) for hallucination detection, and Adaptive Token-level Contrastive Decoding (ATCD) for hallucination mitigation. CATCH addresses issues related to visual defects that cause diminished fine-grained feature perception and cumulative hallucinations in open-ended scenarios. It is applicable to various visual question-answering tasks without requiring any specific data or prior knowledge, and generalizes robustly to new tasks without additional training, opening new possibilities for advancing LVLM in various challenging applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12713v1-abstract-full').style.display = 'none'; document.getElementById('2411.12713v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12711">arXiv:2411.12711</a> <span> [<a href="https://arxiv.org/pdf/2411.12711">pdf</a>, <a href="https://arxiv.org/format/2411.12711">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> UBSoft: A Simulation Platform for Robotic Skill Learning in Unbounded Soft Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chunru Lin</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jugang Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yian Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zeyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhehuan Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lixing Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tsun-Hsuan Wang</a>, <a href="/search/cs?searchtype=author&query=Xian%2C+Z">Zhou Xian</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+C">Chuang Gan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12711v1-abstract-short" style="display: inline;"> It is desired to equip robots with the capability of interacting with various soft materials as they are ubiquitous in the real world. While physics simulations are one of the predominant methods for data collection and robot training, simulating soft materials presents considerable challenges. Specifically, it is significantly more costly than simulating rigid objects in terms of simulation speed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12711v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12711v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12711v1-abstract-full" style="display: none;"> It is desired to equip robots with the capability of interacting with various soft materials as they are ubiquitous in the real world. While physics simulations are one of the predominant methods for data collection and robot training, simulating soft materials presents considerable challenges. Specifically, it is significantly more costly than simulating rigid objects in terms of simulation speed and storage requirements. These limitations typically restrict the scope of studies on soft materials to small and bounded areas, thereby hindering the learning of skills in broader spaces. To address this issue, we introduce UBSoft, a new simulation platform designed to support unbounded soft environments for robot skill acquisition. Our platform utilizes spatially adaptive resolution scales, where simulation resolution dynamically adjusts based on proximity to active robotic agents. Our framework markedly reduces the demand for extensive storage space and computation costs required for large-scale scenarios involving soft materials. We also establish a set of benchmark tasks in our platform, including both locomotion and manipulation tasks, and conduct experiments to evaluate the efficacy of various reinforcement learning algorithms and trajectory optimization techniques, both gradient-based and sampling-based. Preliminary results indicate that sampling-based trajectory optimization generally achieves better results for obtaining one trajectory to solve the task. Additionally, we conduct experiments in real-world environments to demonstrate that advancements made in our UBSoft simulator could translate to improved robot interactions with large-scale soft material. More videos can be found at https://vis-www.cs.umass.edu/ubsoft/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12711v1-abstract-full').style.display = 'none'; document.getElementById('2411.12711v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoRL 2024. The first two authors contributed equally to this paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12338">arXiv:2411.12338</a> <span> [<a href="https://arxiv.org/pdf/2411.12338">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Target Height Estimation Using a Single Acoustic Camera for Compensation in 2D Seabed Mosaicking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiaoteng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yusheng Wang</a>, <a href="/search/cs?searchtype=author&query=Mizuno%2C+K">Katsunori Mizuno</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12338v1-abstract-short" style="display: inline;"> This letter proposes a novel approach for compensating target height data in 2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras are effective sensors for sensing the marine environments due to their high-resolution imaging capabilities and robustness to darkness and turbidity. However, the loss of elevation angle during the imaging process results in a lack of target h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12338v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12338v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12338v1-abstract-full" style="display: none;"> This letter proposes a novel approach for compensating target height data in 2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras are effective sensors for sensing the marine environments due to their high-resolution imaging capabilities and robustness to darkness and turbidity. However, the loss of elevation angle during the imaging process results in a lack of target height information in the original acoustic camera images, leading to a simplistic 2D representation of the seabed mosaicking. In perceiving cluttered and unexplored marine environments, target height data is crucial for avoiding collisions with marine robots. This study proposes a novel approach for estimating seabed target height using a single acoustic camera and integrates height data into 2D seabed mosaicking to compensate for the missing 3D dimension of seabed targets. Unlike classic methods that model the loss of elevation angle to achieve seabed 3D reconstruction, this study focuses on utilizing available acoustic cast shadow clues and simple sensor motion to quickly estimate target height. The feasibility of our proposal is verified through a water tank experiment and a simulation experiment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12338v1-abstract-full').style.display = 'none'; document.getElementById('2411.12338v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages,conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12331">arXiv:2411.12331</a> <span> [<a href="https://arxiv.org/pdf/2411.12331">pdf</a>, <a href="https://arxiv.org/format/2411.12331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Accelerating UMAP for Large-Scale Datasets Through Spectral Coarsening </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12331v1-abstract-short" style="display: inline;"> This paper introduces an innovative approach to dramatically accelerate UMAP using spectral data compression.The proposed method significantly reduces the size of the dataset, preserving its essential manifold structure through an advanced spectral compression technique. This allows UMAP to perform much faster while maintaining the quality of its embeddings. Experiments on real-world datasets, suc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12331v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12331v1-abstract-full" style="display: none;"> This paper introduces an innovative approach to dramatically accelerate UMAP using spectral data compression.The proposed method significantly reduces the size of the dataset, preserving its essential manifold structure through an advanced spectral compression technique. This allows UMAP to perform much faster while maintaining the quality of its embeddings. Experiments on real-world datasets, such as USPS, demonstrate the method's ability to achieve substantial data reduction without compromising embedding fidelity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12331v1-abstract-full').style.display = 'none'; document.getElementById('2411.12331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11921">arXiv:2411.11921</a> <span> [<a href="https://arxiv.org/pdf/2411.11921">pdf</a>, <a href="https://arxiv.org/format/2411.11921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DeSiRe-GS: 4D Street Gaussians for Static-Dynamic Decomposition and Surface Reconstruction for Urban Driving Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chensheng Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixiao Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yichen Xie</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+W">Wei Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11921v1-abstract-short" style="display: inline;"> We present DeSiRe-GS, a self-supervised gaussian splatting representation, enabling effective static-dynamic decomposition and high-fidelity surface reconstruction in complex driving scenarios. Our approach employs a two-stage optimization pipeline of dynamic street Gaussians. In the first stage, we extract 2D motion masks based on the observation that 3D Gaussian Splatting inherently can reconstr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11921v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11921v1-abstract-full" style="display: none;"> We present DeSiRe-GS, a self-supervised gaussian splatting representation, enabling effective static-dynamic decomposition and high-fidelity surface reconstruction in complex driving scenarios. Our approach employs a two-stage optimization pipeline of dynamic street Gaussians. In the first stage, we extract 2D motion masks based on the observation that 3D Gaussian Splatting inherently can reconstruct only the static regions in dynamic environments. These extracted 2D motion priors are then mapped into the Gaussian space in a differentiable manner, leveraging an efficient formulation of dynamic Gaussians in the second stage. Combined with the introduced geometric regularizations, our method are able to address the over-fitting issues caused by data sparsity in autonomous driving, reconstructing physically plausible Gaussians that align with object surfaces rather than floating in air. Furthermore, we introduce temporal cross-view consistency to ensure coherence across time and viewpoints, resulting in high-quality surface reconstruction. Comprehensive experiments demonstrate the efficiency and effectiveness of DeSiRe-GS, surpassing prior self-supervised arts and achieving accuracy comparable to methods relying on external 3D bounding box annotations. Code is available at \url{https://github.com/chengweialan/DeSiRe-GS} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11921v1-abstract-full').style.display = 'none'; document.getElementById('2411.11921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11906">arXiv:2411.11906</a> <span> [<a href="https://arxiv.org/pdf/2411.11906">pdf</a>, <a href="https://arxiv.org/format/2411.11906">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> $\text{S}^{3}$Mamba: Arbitrary-Scale Super-Resolution via Scaleable State Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+P">Peizhe Xia</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/cs?searchtype=author&query=Di%2C+X">Xin Di</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+R">Renjing Pei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+Z">Zheng-Jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11906v1-abstract-short" style="display: inline;"> Arbitrary scale super-resolution (ASSR) aims to super-resolve low-resolution images to high-resolution images at any scale using a single model, addressing the limitations of traditional super-resolution methods that are restricted to fixed-scale factors (e.g., $\times2$, $\times4$). The advent of Implicit Neural Representations (INR) has brought forth a plethora of novel methodologies for ASSR, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11906v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11906v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11906v1-abstract-full" style="display: none;"> Arbitrary scale super-resolution (ASSR) aims to super-resolve low-resolution images to high-resolution images at any scale using a single model, addressing the limitations of traditional super-resolution methods that are restricted to fixed-scale factors (e.g., $\times2$, $\times4$). The advent of Implicit Neural Representations (INR) has brought forth a plethora of novel methodologies for ASSR, which facilitate the reconstruction of original continuous signals by modeling a continuous representation space for coordinates and pixel values, thereby enabling arbitrary-scale super-resolution. Consequently, the primary objective of ASSR is to construct a continuous representation space derived from low-resolution inputs. However, existing methods, primarily based on CNNs and Transformers, face significant challenges such as high computational complexity and inadequate modeling of long-range dependencies, which hinder their effectiveness in real-world applications. To overcome these limitations, we propose a novel arbitrary-scale super-resolution method, called $\text{S}^{3}$Mamba, to construct a scalable continuous representation space. Specifically, we propose a Scalable State Space Model (SSSM) to modulate the state transition matrix and the sampling matrix of step size during the discretization process, achieving scalable and continuous representation modeling with linear computational complexity. Additionally, we propose a novel scale-aware self-attention mechanism to further enhance the network's ability to perceive global important features at different scales, thereby building the $\text{S}^{3}$Mamba to achieve superior arbitrary-scale super-resolution. Extensive experiments on both synthetic and real-world benchmarks demonstrate that our method achieves state-of-the-art performance and superior generalization capabilities at arbitrary super-resolution scales. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11906v1-abstract-full').style.display = 'none'; document.getElementById('2411.11906v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11745">arXiv:2411.11745</a> <span> [<a href="https://arxiv.org/pdf/2411.11745">pdf</a>, <a href="https://arxiv.org/format/2411.11745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> BitMoD: Bit-serial Mixture-of-Datatype LLM Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuzong Chen</a>, <a href="/search/cs?searchtype=author&query=AbouElhamayed%2C+A+F">Ahmed F. AbouElhamayed</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xilai Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Andronic%2C+M">Marta Andronic</a>, <a href="/search/cs?searchtype=author&query=Constantinides%2C+G+A">George A. Constantinides</a>, <a href="/search/cs?searchtype=author&query=Abdelfattah%2C+M+S">Mohamed S. Abdelfattah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11745v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable performance across various machine learning tasks. Yet the substantial memory footprint of LLMs significantly hinders their deployment. In this paper, we improve the accessibility of LLMs through BitMoD, an algorithm-hardware co-design solution that enables efficient LLM acceleration at low weight precision. On the algorithm side, BitMoD in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11745v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11745v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable performance across various machine learning tasks. Yet the substantial memory footprint of LLMs significantly hinders their deployment. In this paper, we improve the accessibility of LLMs through BitMoD, an algorithm-hardware co-design solution that enables efficient LLM acceleration at low weight precision. On the algorithm side, BitMoD introduces fine-grained data type adaptation that uses a different numerical data type to quantize a group of (e.g., 128) weights. Through the careful design of these new data types, BitMoD is able to quantize LLM weights to very low precision (e.g., 4 bits and 3 bits) while maintaining high accuracy. On the hardware side, BitMoD employs a bit-serial processing element to easily support multiple numerical precisions and data types; our hardware design includes two key innovations: First, it employs a unified representation to process different weight data types, thus reducing the hardware cost. Second, it adopts a bit-serial dequantization unit to rescale the per-group partial sum with minimal hardware overhead. Our evaluation on six representative LLMs demonstrates that BitMoD significantly outperforms state-of-the-art LLM quantization and acceleration methods. For discriminative tasks, BitMoD can quantize LLM weights to 4-bit with $<\!0.5\%$ accuracy loss on average. For generative tasks, BitMoD is able to quantize LLM weights to 3-bit while achieving better perplexity than prior LLM quantization scheme. Combining the superior model performance with an efficient accelerator design, BitMoD achieves an average of $1.69\times$ and $1.48\times$ speedups compared to prior LLM accelerators ANT and OliVe, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11745v1-abstract-full').style.display = 'none'; document.getElementById('2411.11745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">HPCA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11697">arXiv:2411.11697</a> <span> [<a href="https://arxiv.org/pdf/2411.11697">pdf</a>, <a href="https://arxiv.org/format/2411.11697">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Robust Reinforcement Learning under Diffusion Models for Data with Jumps </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chenyang Jiang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Donggyu Kim</a>, <a href="/search/cs?searchtype=author&query=Quintos%2C+A">Alejandra Quintos</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yazhen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11697v1-abstract-short" style="display: inline;"> Reinforcement Learning (RL) has proven effective in solving complex decision-making tasks across various domains, but challenges remain in continuous-time settings, particularly when state dynamics are governed by stochastic differential equations (SDEs) with jump components. In this paper, we address this challenge by introducing the Mean-Square Bipower Variation Error (MSBVE) algorithm, which en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11697v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11697v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11697v1-abstract-full" style="display: none;"> Reinforcement Learning (RL) has proven effective in solving complex decision-making tasks across various domains, but challenges remain in continuous-time settings, particularly when state dynamics are governed by stochastic differential equations (SDEs) with jump components. In this paper, we address this challenge by introducing the Mean-Square Bipower Variation Error (MSBVE) algorithm, which enhances robustness and convergence in scenarios involving significant stochastic noise and jumps. We first revisit the Mean-Square TD Error (MSTDE) algorithm, commonly used in continuous-time RL, and highlight its limitations in handling jumps in state dynamics. The proposed MSBVE algorithm minimizes the mean-square quadratic variation error, offering improved performance over MSTDE in environments characterized by SDEs with jumps. Simulations and formal proofs demonstrate that the MSBVE algorithm reliably estimates the value function in complex settings, surpassing MSTDE's performance when faced with jump processes. These findings underscore the importance of alternative error metrics to improve the resilience and effectiveness of RL algorithms in continuous-time frameworks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11697v1-abstract-full').style.display = 'none'; document.getElementById('2411.11697v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11683">arXiv:2411.11683</a> <span> [<a href="https://arxiv.org/pdf/2411.11683">pdf</a>, <a href="https://arxiv.org/format/2411.11683">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the Physical World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xianlong Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+H">Hewen Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hangtao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minghui Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengshan Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Lulu Xue</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peijin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yichen Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+W">Wei Wan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aishan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L+Y">Leo Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11683v1-abstract-short" style="display: inline;"> Robotic manipulation refers to the autonomous handling and interaction of robots with objects using advanced techniques in robotics and artificial intelligence. The advent of powerful tools such as large language models (LLMs) and large vision-language models (LVLMs) has significantly enhanced the capabilities of these robots in environmental perception and decision-making. However, the introducti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11683v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11683v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11683v1-abstract-full" style="display: none;"> Robotic manipulation refers to the autonomous handling and interaction of robots with objects using advanced techniques in robotics and artificial intelligence. The advent of powerful tools such as large language models (LLMs) and large vision-language models (LVLMs) has significantly enhanced the capabilities of these robots in environmental perception and decision-making. However, the introduction of these intelligent agents has led to security threats such as jailbreak attacks and adversarial attacks. In this research, we take a further step by proposing a backdoor attack specifically targeting robotic manipulation and, for the first time, implementing backdoor attack in the physical world. By embedding a backdoor visual language model into the visual perception module within the robotic system, we successfully mislead the robotic arm's operation in the physical world, given the presence of common items as triggers. Experimental evaluations in the physical world demonstrate the effectiveness of the proposed backdoor attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11683v1-abstract-full').style.display = 'none'; document.getElementById('2411.11683v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Initial version with preliminary results. We welcome any feedback or suggestions</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11614">arXiv:2411.11614</a> <span> [<a href="https://arxiv.org/pdf/2411.11614">pdf</a>, <a href="https://arxiv.org/format/2411.11614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> On the physics of nested Markov models: a generalized probabilistic theory perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingjian Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11614v1-abstract-short" style="display: inline;"> Determining potential probability distributions with a given causal graph is vital for causality studies. To bypass the difficulty in characterizing latent variables in a Bayesian network, the nested Markov model provides an elegant algebraic approach by listing exactly all the equality constraints on the observed variables. However, this algebraically motivated causal model comprises distribution… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11614v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11614v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11614v1-abstract-full" style="display: none;"> Determining potential probability distributions with a given causal graph is vital for causality studies. To bypass the difficulty in characterizing latent variables in a Bayesian network, the nested Markov model provides an elegant algebraic approach by listing exactly all the equality constraints on the observed variables. However, this algebraically motivated causal model comprises distributions outside Bayesian networks, and its physical interpretation remains vague. In this work, we inspect the nested Markov model through the lens of generalized probabilistic theory, an axiomatic framework to describe general physical theories. We prove that all the equality constraints defining the nested Markov model hold valid theory-independently. Yet, we show this model generally contains distributions not implementable even within such relaxed physical theories subjected to merely the relativity principles and mild probabilistic rules. To interpret the origin of such a gap, we establish a new causal model that defines valid distributions as projected from a high-dimensional Bell-type causal structure. The new model unveils inequality constraints induced by relativity principles, or equivalently high-dimensional conditional independences, which are absent in the nested Markov model. Nevertheless, we also notice that the restrictions on states and measurements introduced by the generalized probabilistic theory framework can pose additional inequality constraints beyond the new causal model. As a by-product, we discover a new causal structure exhibiting strict gaps between the distribution sets of a Bayesian network, generalized probabilistic theories, and the nested Markov model. We anticipate our results will enlighten further explorations on the unification of algebraic and physical perspectives of causality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11614v1-abstract-full').style.display = 'none'; document.getElementById('2411.11614v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 5 figures, 5 tables; Comments are welcome!</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11507">arXiv:2411.11507</a> <span> [<a href="https://arxiv.org/pdf/2411.11507">pdf</a>, <a href="https://arxiv.org/format/2411.11507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SignEye: Traffic Sign Interpretation from Vehicle First-Person View </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tao Han</a>, <a href="/search/cs?searchtype=author&query=SU%2C+Y">Yuejiao SU</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11507v1-abstract-short" style="display: inline;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction naviga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11507v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11507v1-abstract-full" style="display: none;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction navigation. Following the above issues, we introduce a new task: traffic sign interpretation from the vehicle's first-person view, referred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant (TGA) scenario application to re-explore the role of traffic signs in ADS as a complement to popular autonomous technologies (such as obstacle perception). Notably, TGA is not a replacement for electronic map navigation; rather, TGA can be an automatic tool for updating it and complementing it in situations such as offline conditions or temporary sign adjustments. Lastly, a spatial and semantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to achieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN) is built. Experiments show that TSI-FPV and TGA are achievable via our SignEye trained on Traffic-CN. The results also demonstrate that the TGA can provide complementary information to ADS beyond existing popular autonomous technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'none'; document.getElementById('2411.11507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11435">arXiv:2411.11435</a> <span> [<a href="https://arxiv.org/pdf/2411.11435">pdf</a>, <a href="https://arxiv.org/format/2411.11435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GLDesigner: Leveraging Multi-Modal LLMs as Designer for Enhanced Aesthetic Text Glyph Layouts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Junwen He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jun-Yan He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jin-Peng Lan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yifeng Geng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11435v1-abstract-short" style="display: inline;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11435v1-abstract-full" style="display: none;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper, we propose a VLM-based framework that generates content-aware text logo layouts by integrating multi-modal inputs with user constraints, supporting a more flexible and stable layout design in real-world applications. We introduce two model techniques to reduce the computation for processing multiple glyph images simultaneously, while does not face performance degradation. To support instruction-tuning of out model, we construct two extensive text logo datasets, which are 5x more larger than the existing public dataset. Except for the geometric annotations (e.g. text masks and character recognition), we also compliment with comprehensive layout descriptions in natural language format, for more effective training to have reasoning ability when dealing with complex layouts and custom user constraints. Experimental studies demonstrate the effectiveness of our proposed model and datasets, when comparing with previous methods in various benchmarks to evaluate geometric aesthetics and human preferences. The code and datasets will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'none'; document.getElementById('2411.11435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11421">arXiv:2411.11421</a> <span> [<a href="https://arxiv.org/pdf/2411.11421">pdf</a>, <a href="https://arxiv.org/format/2411.11421">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards fast DBSCAN via Spectrum-Preserving Data Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11421v1-abstract-short" style="display: inline;"> This paper introduces a novel method to significantly accelerate DBSCAN by employing spectral data compression. The proposed approach reduces the size of the data set by a factor of five while preserving the essential clustering characteristics through an innovative spectral compression technique. This enables DBSCAN to run substantially faster without any loss of accuracy. Experiments on real-wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11421v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11421v1-abstract-full" style="display: none;"> This paper introduces a novel method to significantly accelerate DBSCAN by employing spectral data compression. The proposed approach reduces the size of the data set by a factor of five while preserving the essential clustering characteristics through an innovative spectral compression technique. This enables DBSCAN to run substantially faster without any loss of accuracy. Experiments on real-world data sets, such as USPS, demonstrate the method's capability to achieve this dramatic reduction in data size while maintaining clustering performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11421v1-abstract-full').style.display = 'none'; document.getElementById('2411.11421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11351">arXiv:2411.11351</a> <span> [<a href="https://arxiv.org/pdf/2411.11351">pdf</a>, <a href="https://arxiv.org/format/2411.11351">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TNNLS.2024.3499377">10.1109/TNNLS.2024.3499377 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Visual-Semantic Graph Matching Net for Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+B">Bowen Duan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shiming Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yufei Guo</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+G">Guo-Sen Xie</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Weiping Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yisong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11351v1-abstract-short" style="display: inline;"> Zero-shot learning (ZSL) aims to leverage additional semantic information to recognize unseen classes. To transfer knowledge from seen to unseen classes, most ZSL methods often learn a shared embedding space by simply aligning visual embeddings with semantic prototypes. However, methods trained under this paradigm often struggle to learn robust embedding space because they align the two modalities… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11351v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11351v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11351v1-abstract-full" style="display: none;"> Zero-shot learning (ZSL) aims to leverage additional semantic information to recognize unseen classes. To transfer knowledge from seen to unseen classes, most ZSL methods often learn a shared embedding space by simply aligning visual embeddings with semantic prototypes. However, methods trained under this paradigm often struggle to learn robust embedding space because they align the two modalities in an isolated manner among classes, which ignore the crucial class relationship during the alignment process. To address the aforementioned challenges, this paper proposes a Visual-Semantic Graph Matching Net, termed as VSGMN, which leverages semantic relationships among classes to aid in visual-semantic embedding. VSGMN employs a Graph Build Network (GBN) and a Graph Matching Network (GMN) to achieve two-stage visual-semantic alignment. Specifically, GBN first utilizes an embedding-based approach to build visual and semantic graphs in the semantic space and align the embedding with its prototype for first-stage alignment. Additionally, to supplement unseen class relations in these graphs, GBN also build the unseen class nodes based on semantic relationships. In the second stage, GMN continuously integrates neighbor and cross-graph information into the constructed graph nodes, and aligns the node relationships between the two graphs under the class relationship constraint. Extensive experiments on three benchmark datasets demonstrate that VSGMN achieves superior performance in both conventional and generalized ZSL scenarios. The implementation of our VSGMN and experimental results are available at github: https://github.com/dbwfd/VSGMN <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11351v1-abstract-full').style.display = 'none'; document.getElementById('2411.11351v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11276">arXiv:2411.11276</a> <span> [<a href="https://arxiv.org/pdf/2411.11276">pdf</a>, <a href="https://arxiv.org/format/2411.11276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Coupled Integral PINN for conservation law </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yeping Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shihao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11276v1-abstract-short" style="display: inline;"> The Physics-Informed Neural Network (PINN) is an innovative approach to solve a diverse array of partial differential equations (PDEs) leveraging the power of neural networks. This is achieved by minimizing the residual loss associated with the explicit physical information, usually coupled with data derived from initial and boundary conditions. However, a challenge arises in the context of nonlin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11276v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11276v1-abstract-full" style="display: none;"> The Physics-Informed Neural Network (PINN) is an innovative approach to solve a diverse array of partial differential equations (PDEs) leveraging the power of neural networks. This is achieved by minimizing the residual loss associated with the explicit physical information, usually coupled with data derived from initial and boundary conditions. However, a challenge arises in the context of nonlinear conservation laws where derivatives are undefined at shocks, leading to solutions that deviate from the true physical phenomena. To solve this issue, the physical solution must be extracted from the weak formulation of the PDE and is typically further bounded by entropy conditions. Within the numerical framework, finite volume methods (FVM) are employed to address conservation laws. These methods resolve the integral form of conservation laws and delineate the shock characteristics. Inspired by the principles underlying FVM, this paper introduces a novel Coupled Integrated PINN methodology that involves fitting the integral solutions of equations using additional neural networks. This technique not only augments the conventional PINN's capability in modeling shock waves, but also eliminates the need for spatial and temporal discretization. As such, it bypasses the complexities of numerical integration and reconstruction associated with non-convex fluxes. Finally, we show that the proposed new Integrated PINN performs well in conservative law and outperforms the vanilla PINN when tackle the challenging shock problems using examples of Burger's equation, Buckley-Leverett Equation and Euler System. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11276v1-abstract-full').style.display = 'none'; document.getElementById('2411.11276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11223">arXiv:2411.11223</a> <span> [<a href="https://arxiv.org/pdf/2411.11223">pdf</a>, <a href="https://arxiv.org/format/2411.11223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Transfer Learning for Video-language Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoxing Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zizheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yan Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanshuo Wang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Z">Zhongcai Lyu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhuoer Xu</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Jun Lan</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Z">Zhangxuan Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11223v1-abstract-short" style="display: inline;"> Pre-trained vision-language models provide a robust foundation for efficient transfer learning across various downstream tasks. In the field of video action recognition, mainstream approaches often introduce additional parameter modules to capture temporal information. While the increased model capacity brought by these additional parameters helps better fit the video-specific inductive biases, ex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11223v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11223v1-abstract-full" style="display: none;"> Pre-trained vision-language models provide a robust foundation for efficient transfer learning across various downstream tasks. In the field of video action recognition, mainstream approaches often introduce additional parameter modules to capture temporal information. While the increased model capacity brought by these additional parameters helps better fit the video-specific inductive biases, existing methods require learning a large number of parameters and are prone to catastrophic forgetting of the original generalizable knowledge. In this paper, we propose a simple yet effective Multi-modal Spatio-Temporal Adapter (MSTA) to improve the alignment between representations in the text and vision branches, achieving a balance between general knowledge and task-specific knowledge. Furthermore, to mitigate over-fitting and enhance generalizability, we introduce a spatio-temporal description-guided consistency constraint. This constraint involves feeding template inputs (i.e., ``a video of $\{\textbf{cls}\}$'') into the trainable language branch, while LLM-generated spatio-temporal descriptions are input into the pre-trained language branch, enforcing consistency between the outputs of the two branches. This mechanism prevents over-fitting to downstream tasks and improves the distinguishability of the trainable branch within the spatio-temporal semantic space. We evaluate the effectiveness of our approach across four tasks: zero-shot transfer, few-shot learning, base-to-novel generalization, and fully-supervised learning. Compared to many state-of-the-art methods, our MSTA achieves outstanding performance across all evaluations, while using only 2-7\% of the trainable parameters in the original model. Code will be avaliable at https://github.com/chenhaoxing/ETL4Video. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11223v1-abstract-full').style.display = 'none'; document.getElementById('2411.11223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11221">arXiv:2411.11221</a> <span> [<a href="https://arxiv.org/pdf/2411.11221">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data Driven Automatic Electrical Machine Preliminary Design with Artificial Intelligence Expert Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hailin Huang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+T">Tianjie Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jincai Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+N">Nuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuoran Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11221v1-abstract-short" style="display: inline;"> This paper presents a data-driven electrical machine design (EMD) framework using wound-rotor synchronous generator (WRSG) as a design example. Unlike traditional preliminary EMD processes that heavily rely on expertise, this framework leverages an artificial-intelligence based expert database, to provide preliminary designs directly from user specifications. Initial data is generated using 2D fin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11221v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11221v1-abstract-full" style="display: none;"> This paper presents a data-driven electrical machine design (EMD) framework using wound-rotor synchronous generator (WRSG) as a design example. Unlike traditional preliminary EMD processes that heavily rely on expertise, this framework leverages an artificial-intelligence based expert database, to provide preliminary designs directly from user specifications. Initial data is generated using 2D finite element (FE) machine models by sweeping fundamental design variables including machine length and diameter, enabling scalable machine geometry with machine performance for each design is recorded. This data trains a Metamodel of Optimal Prognosis (MOP)-based surrogate model, which maps design variables to key performance indicators (KPIs). Once trained, guided by metaheuristic algorithms, the surrogate model can generate thousands of geometric scalable designs, covering a wide power range, forming an AI expert database to guide future preliminary design. The framework is validated with a 30kVA WRSG design case. A prebuilt WRSG database, covering power from 10 to 60kVA, is validated by FE simulation. Design No.1138 is selected from database and compared with conventional design. Results show No.1138 achieves a higher power density of 2.21 kVA/kg in just 5 seconds, compared to 2.02 kVA/kg obtained using traditional method, which take several days. The developed AI expert database also serves as a high-quality data source for further developing AI models for automatic electrical machine design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11221v1-abstract-full').style.display = 'none'; document.getElementById('2411.11221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11148">arXiv:2411.11148</a> <span> [<a href="https://arxiv.org/pdf/2411.11148">pdf</a>, <a href="https://arxiv.org/format/2411.11148">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TabDeco: A Comprehensive Contrastive Framework for Decoupled Representations in Tabular Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Suiyao Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jing Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+C">Cheng Ji</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tianpei Xie</a>, <a href="/search/cs?searchtype=author&query=Cociorva%2C+D">Daniel Cociorva</a>, <a href="/search/cs?searchtype=author&query=Sharps%2C+M">Michael Sharps</a>, <a href="/search/cs?searchtype=author&query=Levasseur%2C+C">Cecile Levasseur</a>, <a href="/search/cs?searchtype=author&query=Brunzell%2C+H">Hakan Brunzell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11148v1-abstract-short" style="display: inline;"> Representation learning is a fundamental aspect of modern artificial intelligence, driving substantial improvements across diverse applications. While selfsupervised contrastive learning has led to significant advancements in fields like computer vision and natural language processing, its adaptation to tabular data presents unique challenges. Traditional approaches often prioritize optimizing mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11148v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11148v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11148v1-abstract-full" style="display: none;"> Representation learning is a fundamental aspect of modern artificial intelligence, driving substantial improvements across diverse applications. While selfsupervised contrastive learning has led to significant advancements in fields like computer vision and natural language processing, its adaptation to tabular data presents unique challenges. Traditional approaches often prioritize optimizing model architecture and loss functions but may overlook the crucial task of constructing meaningful positive and negative sample pairs from various perspectives like feature interactions, instance-level patterns and batch-specific contexts. To address these challenges, we introduce TabDeco, a novel method that leverages attention-based encoding strategies across both rows and columns and employs contrastive learning framework to effectively disentangle feature representations at multiple levels, including features, instances and data batches. With the innovative feature decoupling hierarchies, TabDeco consistently surpasses existing deep learning methods and leading gradient boosting algorithms, including XG-Boost, CatBoost, and LightGBM, across various benchmark tasks, underscoring its effectiveness in advancing tabular data representation learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11148v1-abstract-full').style.display = 'none'; document.getElementById('2411.11148v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11135">arXiv:2411.11135</a> <span> [<a href="https://arxiv.org/pdf/2411.11135">pdf</a>, <a href="https://arxiv.org/format/2411.11135">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Oscillation Inversion: Understand the structure of Large Flow Model through the Lens of Inversion Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yan Zheng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhenxiao Liang</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaoyan Cong</a>, <a href="/search/cs?searchtype=author&query=guo%2C+L">Lanqing guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuehao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peihao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11135v1-abstract-short" style="display: inline;"> We explore the oscillatory behavior observed in inversion methods applied to large-scale text-to-image diffusion models, with a focus on the "Flux" model. By employing a fixed-point-inspired iterative approach to invert real-world images, we observe that the solution does not achieve convergence, instead oscillating between distinct clusters. Through both toy experiments and real-world diffusion m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11135v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11135v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11135v1-abstract-full" style="display: none;"> We explore the oscillatory behavior observed in inversion methods applied to large-scale text-to-image diffusion models, with a focus on the "Flux" model. By employing a fixed-point-inspired iterative approach to invert real-world images, we observe that the solution does not achieve convergence, instead oscillating between distinct clusters. Through both toy experiments and real-world diffusion models, we demonstrate that these oscillating clusters exhibit notable semantic coherence. We offer theoretical insights, showing that this behavior arises from oscillatory dynamics in rectified flow models. Building on this understanding, we introduce a simple and fast distribution transfer technique that facilitates image enhancement, stroke-based recoloring, as well as visual prompt-guided image editing. Furthermore, we provide quantitative results demonstrating the effectiveness of our method for tasks such as image enhancement, makeup transfer, reconstruction quality, and guided sampling quality. Higher-quality examples of videos and images are available at \href{https://yanyanzheng96.github.io/oscillation_inversion/}{this link}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11135v1-abstract-full').style.display = 'none'; document.getElementById('2411.11135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11027">arXiv:2411.11027</a> <span> [<a href="https://arxiv.org/pdf/2411.11027">pdf</a>, <a href="https://arxiv.org/format/2411.11027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BianCang: A Traditional Chinese Medicine Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+S">Sibo Wei</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xueping Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi-fei Wang</a>, <a href="/search/cs?searchtype=author&query=Si%2C+J">Jiasheng Si</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Wenpeng Lu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaoming Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yinglong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11027v1-abstract-short" style="display: inline;"> The rise of large language models (LLMs) has driven significant progress in medical applications, including traditional Chinese medicine (TCM). However, current medical LLMs struggle with TCM diagnosis and syndrome differentiation due to substantial differences between TCM and modern medical theory, and the scarcity of specialized, high-quality corpora. This paper addresses these challenges by pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11027v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11027v1-abstract-full" style="display: none;"> The rise of large language models (LLMs) has driven significant progress in medical applications, including traditional Chinese medicine (TCM). However, current medical LLMs struggle with TCM diagnosis and syndrome differentiation due to substantial differences between TCM and modern medical theory, and the scarcity of specialized, high-quality corpora. This paper addresses these challenges by proposing BianCang, a TCM-specific LLM, using a two-stage training process that first injects domain-specific knowledge and then aligns it through targeted stimulation. To enhance diagnostic and differentiation capabilities, we constructed pre-training corpora, instruction-aligned datasets based on real hospital records, and the ChP-TCM dataset derived from the Pharmacopoeia of the People's Republic of China. We compiled extensive TCM and medical corpora for continuous pre-training and supervised fine-tuning, building a comprehensive dataset to refine the model's understanding of TCM. Evaluations across 11 test sets involving 29 models and 4 tasks demonstrate the effectiveness of BianCang, offering valuable insights for future research. Code, datasets, and models are available at https://github.com/QLU-NLP/BianCang. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11027v1-abstract-full').style.display = 'none'; document.getElementById('2411.11027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10998">arXiv:2411.10998</a> <span> [<a href="https://arxiv.org/pdf/2411.10998">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Image-Based RKPM for Accessing Failure Mechanisms in Composite Materials </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanran Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yichun Tang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jing Du</a>, <a href="/search/cs?searchtype=author&query=Hillman%2C+M">Mike Hillman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+S">J. S. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10998v1-abstract-short" style="display: inline;"> Stress distributions and the corresponding fracture patterns and evolutions in the microstructures strongly influence the load-carrying capabilities of composite structures. This work introduces an enhanced phase-field fracture model incorporating interface decohesion to simulate fracture propagation and interactions at material interfaces and within the constituents of composite microstructures.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10998v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10998v1-abstract-full" style="display: none;"> Stress distributions and the corresponding fracture patterns and evolutions in the microstructures strongly influence the load-carrying capabilities of composite structures. This work introduces an enhanced phase-field fracture model incorporating interface decohesion to simulate fracture propagation and interactions at material interfaces and within the constituents of composite microstructures. The proposed method employs an interface-modified reproducing kernel (IM-RK) approximation for handling cross-interface discontinuities constructed from image voxels and guided by Support Vector Machine (SVM) ma-terial classification. The numerical models are directly generated from X-ray microtomography image voxels, guided by SVM using voxel color code information. Additionally, a strain energy-based phase field variable is introduced, eliminating the need to solve coupled field problems. The effectiveness of this method is demonstrated in modeling crack growth both along interfaces and across matrix and inclusion domains and in predicting the corresponding structural-scale mechanical behavior in composite structures. Furthermore, the proposed method has been validated against experimentally observed crack patterns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10998v1-abstract-full').style.display = 'none'; document.getElementById('2411.10998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 9 figures. arXiv admin note: text overlap with arXiv:2305.16402</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10982">arXiv:2411.10982</a> <span> [<a href="https://arxiv.org/pdf/2411.10982">pdf</a>, <a href="https://arxiv.org/format/2411.10982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Towards a framework on tabular synthetic data generation: a minimalist approach: theory, use cases, and limitations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yueyang Shen</a>, <a href="/search/cs?searchtype=author&query=Sudjianto%2C+A">Agus Sudjianto</a>, <a href="/search/cs?searchtype=author&query=R%2C+A+P">Arun Prakash R</a>, <a href="/search/cs?searchtype=author&query=Bhattacharyya%2C+A">Anwesha Bhattacharyya</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+M">Maorong Rao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaqun Wang</a>, <a href="/search/cs?searchtype=author&query=Vaughan%2C+J">Joel Vaughan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+N">Nengfeng Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10982v2-abstract-short" style="display: inline;"> We propose and study a minimalist approach towards synthetic tabular data generation. The model consists of a minimalistic unsupervised SparsePCA encoder (with contingent clustering step or log transformation to handle nonlinearity) and XGboost decoder which is SOTA for structured data regression and classification tasks. We study and contrast the methodologies with (variational) autoencoders in s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10982v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10982v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10982v2-abstract-full" style="display: none;"> We propose and study a minimalist approach towards synthetic tabular data generation. The model consists of a minimalistic unsupervised SparsePCA encoder (with contingent clustering step or log transformation to handle nonlinearity) and XGboost decoder which is SOTA for structured data regression and classification tasks. We study and contrast the methodologies with (variational) autoencoders in several toy low dimensional scenarios to derive necessary intuitions. The framework is applied to high dimensional simulated credit scoring data which parallels real-life financial applications. We applied the method to robustness testing to demonstrate practical use cases. The case study result suggests that the method provides an alternative to raw and quantile perturbation for model robustness testing. We show that the method is simplistic, guarantees interpretability all the way through, does not require extra tuning and provide unique benefits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10982v2-abstract-full').style.display = 'none'; document.getElementById('2411.10982v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10948">arXiv:2411.10948</a> <span> [<a href="https://arxiv.org/pdf/2411.10948">pdf</a>, <a href="https://arxiv.org/format/2411.10948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Accurate and Efficient Sub-8-Bit Integer Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+W">Wenjin Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Donglai Liu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+W">Weiying Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsong Li</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+X">Xuefei Ning</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zihan Meng</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shulin Zeng</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jie Lei</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zhenman Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10948v1-abstract-short" style="display: inline;"> Neural network training is a memory- and compute-intensive task. Quantization, which enables low-bitwidth formats in training, can significantly mitigate the workload. To reduce quantization error, recent methods have developed new data formats and additional pre-processing operations on quantizers. However, it remains quite challenging to achieve high accuracy and efficiency simultaneously. In th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10948v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10948v1-abstract-full" style="display: none;"> Neural network training is a memory- and compute-intensive task. Quantization, which enables low-bitwidth formats in training, can significantly mitigate the workload. To reduce quantization error, recent methods have developed new data formats and additional pre-processing operations on quantizers. However, it remains quite challenging to achieve high accuracy and efficiency simultaneously. In this paper, we explore sub-8-bit integer training from its essence of gradient descent optimization. Our integer training framework includes two components: ShiftQuant to realize accurate gradient estimation, and L1 normalization to smoothen the loss landscape. ShiftQuant attains performance that approaches the theoretical upper bound of group quantization. Furthermore, it liberates group quantization from inefficient memory rearrangement. The L1 normalization facilitates the implementation of fully quantized normalization layers with impressive convergence accuracy. Our method frees sub-8-bit integer training from pre-processing and supports general devices. This framework achieves negligible accuracy loss across various neural networks and tasks ($0.92\%$ on 4-bit ResNets, $0.61\%$ on 6-bit Transformers). The prototypical implementation of ShiftQuant achieves more than $1.85\times/15.3\%$ performance improvement on CPU/GPU compared to its FP16 counterparts, and $33.9\%$ resource consumption reduction on FPGA than the FP16 counterparts. The proposed fully-quantized L1 normalization layers achieve more than $35.54\%$ improvement in throughout on CPU compared to traditional L2 normalization layers. Moreover, theoretical analysis verifies the advancement of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10948v1-abstract-full').style.display = 'none'; document.getElementById('2411.10948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10836">arXiv:2411.10836</a> <span> [<a href="https://arxiv.org/pdf/2411.10836">pdf</a>, <a href="https://arxiv.org/format/2411.10836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AnimateAnything: Consistent and Controllable Animation for Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+G">Guojun Lei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yikai Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weiwei Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10836v1-abstract-short" style="display: inline;"> We present a unified controllable video generation approach AnimateAnything that facilitates precise and consistent video manipulation across various conditions, including camera trajectories, text prompts, and user motion annotations. Specifically, we carefully design a multi-scale control feature fusion network to construct a common motion representation for different conditions. It explicitly c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10836v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10836v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10836v1-abstract-full" style="display: none;"> We present a unified controllable video generation approach AnimateAnything that facilitates precise and consistent video manipulation across various conditions, including camera trajectories, text prompts, and user motion annotations. Specifically, we carefully design a multi-scale control feature fusion network to construct a common motion representation for different conditions. It explicitly converts all control information into frame-by-frame optical flows. Then we incorporate the optical flows as motion priors to guide final video generation. In addition, to reduce the flickering issues caused by large-scale motion, we propose a frequency-based stabilization module. It can enhance temporal coherence by ensuring the video's frequency domain consistency. Experiments demonstrate that our method outperforms the state-of-the-art approaches. For more details and videos, please refer to the webpage: https://yu-shaonian.github.io/Animate_Anything/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10836v1-abstract-full').style.display = 'none'; document.getElementById('2411.10836v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10798">arXiv:2411.10798</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unveiling Hidden Details: A RAW Data-Enhanced Paradigm for Real-World Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+L">Long Peng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiaming Guo</a>, <a href="/search/cs?searchtype=author&query=Di%2C+X">Xin Di</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoze Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+R">Renjing Pei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+Z">Zheng-Jun Zha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10798v2-abstract-short" style="display: inline;"> Real-world image super-resolution (Real SR) aims to generate high-fidelity, detail-rich high-resolution (HR) images from low-resolution (LR) counterparts. Existing Real SR methods primarily focus on generating details from the LR RGB domain, often leading to a lack of richness or fidelity in fine details. In this paper, we pioneer the use of details hidden in RAW data to complement existing RGB-on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10798v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10798v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10798v2-abstract-full" style="display: none;"> Real-world image super-resolution (Real SR) aims to generate high-fidelity, detail-rich high-resolution (HR) images from low-resolution (LR) counterparts. Existing Real SR methods primarily focus on generating details from the LR RGB domain, often leading to a lack of richness or fidelity in fine details. In this paper, we pioneer the use of details hidden in RAW data to complement existing RGB-only methods, yielding superior outputs. We argue that key image processing steps in Image Signal Processing, such as denoising and demosaicing, inherently result in the loss of fine details in LR images, making LR RAW a valuable information source. To validate this, we present RealSR-RAW, a comprehensive dataset comprising over 10,000 pairs with LR and HR RGB images, along with corresponding LR RAW, captured across multiple smartphones under varying focal lengths and diverse scenes. Additionally, we propose a novel, general RAW adapter to efficiently integrate LR RAW data into existing CNNs, Transformers, and Diffusion-based Real SR models by suppressing the noise contained in LR RAW and aligning its distribution. Extensive experiments demonstrate that incorporating RAW data significantly enhances detail recovery and improves Real SR performance across ten evaluation metrics, including both fidelity and perception-oriented metrics. Our findings open a new direction for the Real SR task, with the dataset and code will be made available to support future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10798v2-abstract-full').style.display = 'none'; document.getElementById('2411.10798v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We sincerely apologize, but due to some commercial confidentiality agreements related to the report, we have decided to withdraw the submission for now and will resubmit after making the necessary revisions</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10709">arXiv:2411.10709</a> <span> [<a href="https://arxiv.org/pdf/2411.10709">pdf</a>, <a href="https://arxiv.org/format/2411.10709">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diagnostic Text-guided Representation Learning in Hierarchical Classification for Pathological Whole Slide Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiawen Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qiehe Sun</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Renao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhi Wang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yuqiu Fu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yani Wei</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+T">Tian Guan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yonghonghe He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Anjia Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10709v1-abstract-short" style="display: inline;"> With the development of digital imaging in medical microscopy, artificial intelligent-based analysis of pathological whole slide images (WSIs) provides a powerful tool for cancer diagnosis. Limited by the expensive cost of pixel-level annotation, current research primarily focuses on representation learning with slide-level labels, showing success in various downstream tasks. However, given the di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10709v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10709v1-abstract-full" style="display: none;"> With the development of digital imaging in medical microscopy, artificial intelligent-based analysis of pathological whole slide images (WSIs) provides a powerful tool for cancer diagnosis. Limited by the expensive cost of pixel-level annotation, current research primarily focuses on representation learning with slide-level labels, showing success in various downstream tasks. However, given the diversity of lesion types and the complex relationships between each other, these techniques still deserve further exploration in addressing advanced pathology tasks. To this end, we introduce the concept of hierarchical pathological image classification and propose a representation learning called PathTree. PathTree considers the multi-classification of diseases as a binary tree structure. Each category is represented as a professional pathological text description, which messages information with a tree-like encoder. The interactive text features are then used to guide the aggregation of hierarchical multiple representations. PathTree uses slide-text similarity to obtain probability scores and introduces two extra tree specific losses to further constrain the association between texts and slides. Through extensive experiments on three challenging hierarchical classification datasets: in-house cryosectioned lung tissue lesion identification, public prostate cancer grade assessment, and public breast cancer subtyping, our proposed PathTree is consistently competitive compared to the state-of-the-art methods and provides a new perspective on the deep learning-assisted solution for more complex WSI classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10709v1-abstract-full').style.display = 'none'; document.getElementById('2411.10709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 13 figures. Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10707">arXiv:2411.10707</a> <span> [<a href="https://arxiv.org/pdf/2411.10707">pdf</a>, <a href="https://arxiv.org/format/2411.10707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Differentiable Extensions with Rounding Guarantees for Combinatorial Optimization over Permutations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nerem%2C+R+R">Robert R. Nerem</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhishang Luo</a>, <a href="/search/cs?searchtype=author&query=Rafiey%2C+A">Akbar Rafiey</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yusu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10707v1-abstract-short" style="display: inline;"> We present Birkhoff Extension (BE), an almost-everywhere-differentiable continuous polytime-computable extension of any real-valued function on permutations to doubly stochastic matrices. Our approach is based on Birkhoff decomposition (also referred to as Birkhoff von-Neumann decomposition) which allows construction of an extension that is always a convex combination of the objective's values at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10707v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10707v1-abstract-full" style="display: none;"> We present Birkhoff Extension (BE), an almost-everywhere-differentiable continuous polytime-computable extension of any real-valued function on permutations to doubly stochastic matrices. Our approach is based on Birkhoff decomposition (also referred to as Birkhoff von-Neumann decomposition) which allows construction of an extension that is always a convex combination of the objective's values at permutations. We show how to construct a specific family of Birkhoff decompositions that are continuous. In addition to continuity, our extension has several nice properties making it appealing for optimization problems. First, BE provides a rounding guarantee, namely any solution to the extension can be efficiently rounded to a permutation without increasing the function value. Furthermore, an approximate solution in the relaxed case (with extension) will give rise to an approximate solution in the space of permutations. Second, using BE, any real-valued optimization objective on permutations can be extended to an almost everywhere differentiable objective function over the space of doubly stochastic matrices. This makes our BE amenable to not only gradient-descent based optimizations, but also unsupervised neural combinatorial optimization where training often requires a differentiable loss. Third, based on the above properties, we present a simple optimization procedure which can be readily combined with existing optimization approaches to offer local improvements (i.e., the quality of the final solution is no worse than the initial solution). We present preliminary experimental results to verify our theoretical results on several combinatorial optimization problems related to permutations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10707v1-abstract-full').style.display = 'none'; document.getElementById('2411.10707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10701">arXiv:2411.10701</a> <span> [<a href="https://arxiv.org/pdf/2411.10701">pdf</a>, <a href="https://arxiv.org/format/2411.10701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion-based Layer-wise Semantic Reconstruction for Unsupervised Out-of-Distribution Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Ying Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+D">De Cheng</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chaowei Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yubiao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+C">Changzhe Jiao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lechao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+N">Nannan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10701v1-abstract-short" style="display: inline;"> Unsupervised out-of-distribution (OOD) detection aims to identify out-of-domain data by learning only from unlabeled In-Distribution (ID) training samples, which is crucial for developing a safe real-world machine learning system. Current reconstruction-based methods provide a good alternative approach by measuring the reconstruction error between the input and its corresponding generative counter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10701v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10701v1-abstract-full" style="display: none;"> Unsupervised out-of-distribution (OOD) detection aims to identify out-of-domain data by learning only from unlabeled In-Distribution (ID) training samples, which is crucial for developing a safe real-world machine learning system. Current reconstruction-based methods provide a good alternative approach by measuring the reconstruction error between the input and its corresponding generative counterpart in the pixel/feature space. However, such generative methods face a key dilemma: improving the reconstruction power of the generative model while keeping a compact representation of the ID data. To address this issue, we propose the diffusion-based layer-wise semantic reconstruction approach for unsupervised OOD detection. The innovation of our approach is that we leverage the diffusion model's intrinsic data reconstruction ability to distinguish ID samples from OOD samples in the latent feature space. Moreover, to set up a comprehensive and discriminative feature representation, we devise a multi-layer semantic feature extraction strategy. By distorting the extracted features with Gaussian noise and applying the diffusion model for feature reconstruction, the separation of ID and OOD samples is implemented according to the reconstruction errors. Extensive experimental results on multiple benchmarks built upon various datasets demonstrate that our method achieves state-of-the-art performance in terms of detection accuracy and speed. Code is available at <https://github.com/xbyym/DLSR>. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10701v1-abstract-full').style.display = 'none'; document.getElementById('2411.10701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 23 figures, published to Neurlps2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 38th Conference on Neural Information Processing Systems (NeurIPS 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10687">arXiv:2411.10687</a> <span> [<a href="https://arxiv.org/pdf/2411.10687">pdf</a>, <a href="https://arxiv.org/format/2411.10687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EDBooks: AI-Enhanced Interactive Narratives for Programming Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oney%2C+S">Steve Oney</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yue Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y+S">Young Suh Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziang Wang</a>, <a href="/search/cs?searchtype=author&query=Khajekar%2C+Y">Yamini Khajekar</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiacheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A+Y">April Yi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10687v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown the potential to be valuable teaching tools, with the potential of giving every student a personalized tutor. However, one challenge with using LLMs to learn new concepts is that when learning a topic in an unfamiliar domain, it can be difficult to know what questions to ask. Further, language models do not always encourage "active learning" where students c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10687v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10687v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown the potential to be valuable teaching tools, with the potential of giving every student a personalized tutor. However, one challenge with using LLMs to learn new concepts is that when learning a topic in an unfamiliar domain, it can be difficult to know what questions to ask. Further, language models do not always encourage "active learning" where students can test and assess their understanding. In this paper, we propose ways to combine large language models with "traditional" learning materials (like e-books) to give readers the benefits of working with LLMs (the ability to ask personally interesting questions and receive personalized answers) with the benefits of a traditional e-book (having a structure and content that is pedagogically sound). This work shows one way that LLMs have the potential to improve learning materials and make personalized programming education more accessible to a broader audience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10687v1-abstract-full').style.display = 'none'; document.getElementById('2411.10687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10340">arXiv:2411.10340</a> <span> [<a href="https://arxiv.org/pdf/2411.10340">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Domain Adaptation-based Edge Computing for Cross-Conditions Fault Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chu Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinhong Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Ziyang Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10340v1-abstract-short" style="display: inline;"> Fault diagnosis technology supports the healthy operation of mechanical equipment. However, the variations conditions during the operation of mechanical equipment lead to significant disparities in data distribution, posing challenges to fault diagnosis. Furthermore, when deploying applications, traditional methods often encounter issues such as latency and data security. Therefore, conducting fau… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10340v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10340v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10340v1-abstract-full" style="display: none;"> Fault diagnosis technology supports the healthy operation of mechanical equipment. However, the variations conditions during the operation of mechanical equipment lead to significant disparities in data distribution, posing challenges to fault diagnosis. Furthermore, when deploying applications, traditional methods often encounter issues such as latency and data security. Therefore, conducting fault diagnosis and deploying application methods under cross-operating conditions holds significant value. This paper proposes a domain adaptation-based lightweight fault diagnosis framework for edge computing scenarios. Incorporating the local maximum mean discrepancy into knowledge transfer aligns the feature distributions of different domains in a high-dimensional feature space, to discover a common feature space across domains. The acquired fault diagnosis expertise from the cloud-model is transferred to the lightweight edge-model using adaptation knowledge transfer methods. While ensuring real-time diagnostic capabilities, accurate fault diagnosis is achieved across working conditions. We conducted validation experiments on the NVIDIA Jetson Xavier NX kit. In terms of diagnostic performance, the proposed method significantly improved diagnostic accuracy, with average increases of 34.44% and 17.33% compared to the comparison method, respectively. Regarding lightweight effectiveness, proposed method achieved an average inference speed increase of 80.47%. Additionally, compared to the cloud-model, the parameter count of the edge-model decreased by 96.37%, while the Flops decreased by 83.08%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10340v1-abstract-full').style.display = 'none'; document.getElementById('2411.10340v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10195">arXiv:2411.10195</a> <span> [<a href="https://arxiv.org/pdf/2411.10195">pdf</a>, <a href="https://arxiv.org/format/2411.10195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> BEV-ODOM: Reducing Scale Drift in Monocular Visual Odometry with BEV Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yufei Wei</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Sha Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+F">Fuzhang Han</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+R">Rong Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10195v1-abstract-short" style="display: inline;"> Monocular visual odometry (MVO) is vital in autonomous navigation and robotics, providing a cost-effective and flexible motion tracking solution, but the inherent scale ambiguity in monocular setups often leads to cumulative errors over time. In this paper, we present BEV-ODOM, a novel MVO framework leveraging the Bird's Eye View (BEV) Representation to address scale drift. Unlike existing approac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10195v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10195v1-abstract-full" style="display: none;"> Monocular visual odometry (MVO) is vital in autonomous navigation and robotics, providing a cost-effective and flexible motion tracking solution, but the inherent scale ambiguity in monocular setups often leads to cumulative errors over time. In this paper, we present BEV-ODOM, a novel MVO framework leveraging the Bird's Eye View (BEV) Representation to address scale drift. Unlike existing approaches, BEV-ODOM integrates a depth-based perspective-view (PV) to BEV encoder, a correlation feature extraction neck, and a CNN-MLP-based decoder, enabling it to estimate motion across three degrees of freedom without the need for depth supervision or complex optimization techniques. Our framework reduces scale drift in long-term sequences and achieves accurate motion estimation across various datasets, including NCLT, Oxford, and KITTI. The results indicate that BEV-ODOM outperforms current MVO methods, demonstrating reduced scale drift and higher accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10195v1-abstract-full').style.display = 'none'; document.getElementById('2411.10195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10143">arXiv:2411.10143</a> <span> [<a href="https://arxiv.org/pdf/2411.10143">pdf</a>, <a href="https://arxiv.org/format/2411.10143">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Software">cs.MS</span> </div> </div> <p class="title is-5 mathjax"> Cascaded Prediction and Asynchronous Execution of Iterative Algorithms on Heterogeneous Platforms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jianhua Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingjie Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhuo Wang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+W">Weixing Ji</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hua Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10143v1-abstract-short" style="display: inline;"> Owing to the diverse scales and varying distributions of sparse matrices arising from practical problems, a multitude of choices are present in the design and implementation of sparse matrix-vector multiplication (SpMV). Researchers have proposed many machine learning-based optimization methods for SpMV. However, these efforts only support one area of sparse matrix format selection, SpMV algorithm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10143v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10143v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10143v1-abstract-full" style="display: none;"> Owing to the diverse scales and varying distributions of sparse matrices arising from practical problems, a multitude of choices are present in the design and implementation of sparse matrix-vector multiplication (SpMV). Researchers have proposed many machine learning-based optimization methods for SpMV. However, these efforts only support one area of sparse matrix format selection, SpMV algorithm selection, or parameter configuration, and rarely consider a large amount of time overhead associated with feature extraction, model inference, and compression format conversion. This paper introduces a machine learning-based cascaded prediction method for SpMV computations that spans various computing stages and hierarchies. Besides, an asynchronous and concurrent computing model has been designed and implemented for runtime model prediction and iterative algorithm solving on heterogeneous computing platforms. It not only offers comprehensive support for the iterative algorithm-solving process leveraging machine learning technology, but also effectively mitigates the preprocessing overheads. Experimental results demonstrate that the cascaded prediction introduced in this paper accelerates SpMV by 1.33x on average, and the iterative algorithm, enhanced by cascaded prediction and asynchronous execution, optimizes by 2.55x on average. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10143v1-abstract-full').style.display = 'none'; document.getElementById('2411.10143v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures, 7 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68-02; 68W10; 65F50 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> A.1; D.1.3; G.1.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10058">arXiv:2411.10058</a> <span> [<a href="https://arxiv.org/pdf/2411.10058">pdf</a>, <a href="https://arxiv.org/format/2411.10058">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TSG.2020.3011266">10.1109/TSG.2020.3011266 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unsupervised Congestion Status Identification Using LMP Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kedi Zheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qixin Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+C">Chongqing Kang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Le Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10058v1-abstract-short" style="display: inline;"> Having a better understanding of how locational marginal prices (LMPs) change helps in price forecasting and market strategy making. This paper investigates the fundamental distribution of the congestion part of LMPs in high-dimensional Euclidean space using an unsupervised approach. LMP models based on the lossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the overlapping subs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10058v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10058v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10058v1-abstract-full" style="display: none;"> Having a better understanding of how locational marginal prices (LMPs) change helps in price forecasting and market strategy making. This paper investigates the fundamental distribution of the congestion part of LMPs in high-dimensional Euclidean space using an unsupervised approach. LMP models based on the lossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the overlapping subspace property of the LMP data. The congestion part of LMPs is spanned by certain row vectors of the power transfer distribution factor (PTDF) matrix, and the subspace attributes of an LMP vector uniquely are found to reflect the instantaneous congestion status of all the transmission lines. The proposed method searches for the basis vectors that span the subspaces of congestion LMP data in hierarchical ways. In the bottom-up search, the data belonging to 1-dimensional subspaces are detected, and other data are projected on the orthogonal subspaces. This procedure is repeated until all the basis vectors are found or the basis gap appears. Top-down searching is used to address the basis gap by hyperplane detection with outliers. Once all the basis vectors are detected, the congestion status can be identified. Numerical experiments based on the IEEE 30-bus system, IEEE 118-bus system, Illinois 200-bus system, and Southwest Power Pool are conducted to show the performance of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10058v1-abstract-full').style.display = 'none'; document.getElementById('2411.10058v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted for IEEE Transactions on Smart Grid. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in IEEE Transactions on Smart Grid, vol. 12, no. 1, pp. 726-736, Jan. 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10000">arXiv:2411.10000</a> <span> [<a href="https://arxiv.org/pdf/2411.10000">pdf</a>, <a href="https://arxiv.org/format/2411.10000">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DuSEGO: Dual Second-order Equivariant Graph Ordinary Differential Equation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingxu Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+N">Nan Yin</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Mingyan Xiao</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+X">Xinhao Yi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siwei Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shangsong Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10000v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) with equivariant properties have achieved significant success in modeling complex dynamic systems and molecular properties. However, their expressiveness ability is limited by: (1) Existing methods often overlook the over-smoothing issue caused by traditional GNN models, as well as the gradient explosion or vanishing problems in deep GNNs. (2) Most models operate on fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10000v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10000v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10000v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) with equivariant properties have achieved significant success in modeling complex dynamic systems and molecular properties. However, their expressiveness ability is limited by: (1) Existing methods often overlook the over-smoothing issue caused by traditional GNN models, as well as the gradient explosion or vanishing problems in deep GNNs. (2) Most models operate on first-order information, neglecting that the real world often consists of second-order systems, which further limits the model's representation capabilities. To address these issues, we propose the \textbf{Du}al \textbf{S}econd-order \textbf{E}quivariant \textbf{G}raph \textbf{O}rdinary Differential Equation (\method{}) for equivariant representation. Specifically, \method{} apply the dual second-order equivariant graph ordinary differential equations (Graph ODEs) on graph embeddings and node coordinates, simultaneously. Theoretically, we first prove that \method{} maintains the equivariant property. Furthermore, we provide theoretical insights showing that \method{} effectively alleviates the over-smoothing problem in both feature representation and coordinate update. Additionally, we demonstrate that the proposed \method{} mitigates the exploding and vanishing gradients problem, facilitating the training of deep multi-layer GNNs. Extensive experiments on benchmark datasets validate the superiority of the proposed \method{} compared to baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10000v1-abstract-full').style.display = 'none'; document.getElementById('2411.10000v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>