Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 480 results for author: <span class="mathjax">Yan, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yan%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yan, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yan%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yan, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yan%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06419">arXiv:2502.06419</a> <span> [<a href="https://arxiv.org/pdf/2502.06419">pdf</a>, <a href="https://arxiv.org/format/2502.06419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Occ-LLM: Enhancing Autonomous Driving with Occupancy-Based Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianshuo Xu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hao Lu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingcong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06419v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have made substantial advancements in the field of robotic and autonomous driving. This study presents the first Occupancy-based Large Language Model (Occ-LLM), which represents a pioneering effort to integrate LLMs with an important representation. 
To effectively encode occupancy as input for the LLM and address the category imbalances associated with occupancy, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06419v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06419v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have made substantial advancements in the field of robotic and autonomous driving. This study presents the first Occupancy-based Large Language Model (Occ-LLM), which represents a pioneering effort to integrate LLMs with an important representation. To effectively encode occupancy as input for the LLM and address the category imbalances associated with occupancy, we propose Motion Separation Variational Autoencoder (MS-VAE). This innovative approach utilizes prior knowledge to distinguish dynamic objects from static scenes before inputting them into a tailored Variational Autoencoder (VAE). This separation enhances the model's capacity to concentrate on dynamic trajectories while effectively reconstructing static scenes. The efficacy of Occ-LLM has been validated across key tasks, including 4D occupancy forecasting, self-ego planning, and occupancy-based scene question answering. Comprehensive evaluations demonstrate that Occ-LLM significantly surpasses existing state-of-the-art methodologies, achieving gains of about 6\% in Intersection over Union (IoU) and 4\% in mean Intersection over Union (mIoU) for the task of 4D occupancy forecasting. These findings highlight the transformative potential of Occ-LLM in reshaping current paradigms within robotic and autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06419v1-abstract-full').style.display = 'none'; document.getElementById('2502.06419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
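A minimal sketch of the motion-separation step described in this abstract (illustrative only: the class IDs, grid shape, and the downstream per-stream VAEs are assumptions, not the paper's code). The point is that dynamic and static voxels are split by semantic class before each stream is encoded separately:

```python
import numpy as np

# Hypothetical semantic IDs treated as dynamic (e.g. car, pedestrian, cyclist).
DYNAMIC_CLASSES = {1, 2, 3}

def split_occupancy(grid: np.ndarray):
    """Split a semantic occupancy grid (X, Y, Z) of class IDs into a
    dynamic stream and a static stream, as in the MS-VAE idea."""
    dyn_mask = np.isin(grid, list(DYNAMIC_CLASSES))
    dynamic = np.where(dyn_mask, grid, 0)   # keep only moving classes
    static = np.where(dyn_mask, 0, grid)    # keep only the scene
    return dynamic, static

grid = np.random.randint(0, 8, size=(16, 16, 4))
dynamic, static = split_occupancy(grid)
assert (dynamic + static == grid).all()     # the separation is lossless
# Each stream would then be encoded by its own tailored VAE.
```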
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in 2025 IEEE International Conference on Robotics and Automation (ICRA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01705">arXiv:2502.01705</a> <span> [<a href="https://arxiv.org/pdf/2502.01705">pdf</a>, <a href="https://arxiv.org/format/2502.01705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Progressive Binarization with Semi-Structured Pruning for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xianglong Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiteng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01705v2-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved remarkable success in natural language processing tasks, but their high computational and memory demands pose challenges for deployment on resource-constrained devices. Binarization, as an efficient compression method that reduces model weights to just 1 bit, significantly lowers both computational and memory requirements. Despite this, the binarized LLM… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01705v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01705v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01705v2-abstract-full" style="display: none;"> Large language models (LLMs) have achieved remarkable success in natural language processing tasks, but their high computational and memory demands pose challenges for deployment on resource-constrained devices. Binarization, as an efficient compression method that reduces model weights to just 1 bit, significantly lowers both computational and memory requirements. Despite this, the binarized LLM still contains redundancy, which can be further compressed. Semi-structured pruning provides a promising approach to achieve this, which offers a better trade-off between model performance and hardware efficiency. However, simply combining binarization with semi-structured pruning can lead to a significant performance drop. To address this issue, we propose a Progressive Binarization with Semi-Structured Pruning (PBS$^2$P) method for LLM compression. We first propose a Stepwise semi-structured Pruning with Binarization Optimization (SPBO). Our optimization strategy significantly reduces the total error caused by pruning and binarization, even below that of the no-pruning scenario. Furthermore, we design a Coarse-to-Fine Search (CFS) method to select pruning elements more effectively. Extensive experiments demonstrate that PBS$^2$P achieves superior accuracy across various LLM families and evaluation metrics, noticeably outperforming state-of-the-art (SOTA) binary PTQ methods. 
The code and models will be available at https://github.com/XIANGLONGYAN/PBS2P. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01705v2-abstract-full').style.display = 'none'; document.getElementById('2502.01705v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13087">arXiv:2501.13087</a> <span> [<a href="https://arxiv.org/pdf/2501.13087">pdf</a>, <a href="https://arxiv.org/format/2501.13087">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Orchid: Image Latent Diffusion for Joint Appearance and Geometry Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Krishnan%2C+A">Akshay Krishnan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xinchen Yan</a>, <a href="/search/cs?searchtype=author&query=Casser%2C+V">Vincent Casser</a>, <a href="/search/cs?searchtype=author&query=Kundu%2C+A">Abhijit Kundu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13087v1-abstract-short" style="display: inline;"> Diffusion models are state-of-the-art for image generation. Trained on large datasets, they capture expressive image priors that have been used for tasks like inpainting, depth, and (surface) normal prediction. However, these models are typically trained for one specific task, e.g., a separate model for each of color, depth, and normal prediction. Such models do not leverage the intrinsic correlat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13087v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13087v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13087v1-abstract-full" style="display: none;"> Diffusion models are state-of-the-art for image generation. Trained on large datasets, they capture expressive image priors that have been used for tasks like inpainting, depth, and (surface) normal prediction. However, these models are typically trained for one specific task, e.g., a separate model for each of color, depth, and normal prediction. Such models do not leverage the intrinsic correlation between appearance and geometry, often leading to inconsistent predictions. In this paper, we propose using a novel image diffusion prior that jointly encodes appearance and geometry. We introduce a diffusion model Orchid, comprising a Variational Autoencoder (VAE) to encode color, depth, and surface normals to a latent space, and a Latent Diffusion Model (LDM) for generating these joint latents. 
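To see what the two compression steps do individually, here is a minimal numpy sketch (not PBS²P itself, which interleaves and jointly optimizes the steps; the per-row scale and the naive magnitude criterion are standard but assumed here):

```python
import numpy as np

def binarize(w: np.ndarray) -> np.ndarray:
    """1-bit binarization: each row keeps only a sign pattern plus one
    per-row scale alpha = mean(|w|), the classic BNN-style quantizer."""
    alpha = np.abs(w).mean(axis=1, keepdims=True)
    return alpha * np.sign(w)

def prune_2_of_4(w: np.ndarray) -> np.ndarray:
    """2:4 semi-structured pruning: in every group of 4 consecutive
    weights, zero out the 2 with the smallest magnitude."""
    rows, cols = w.shape
    groups = w.reshape(rows, cols // 4, 4)
    drop = np.argsort(np.abs(groups), axis=-1)[..., :2]  # 2 smallest per group
    mask = np.ones_like(groups)
    np.put_along_axis(mask, drop, 0.0, axis=-1)
    return (groups * mask).reshape(rows, cols)

w = np.random.randn(8, 16)
w_compressed = binarize(prune_2_of_4(w))  # the naive combination; PBS²P
                                          # instead optimizes the two jointly
```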
3. arXiv:2501.13087 [pdf, other] - cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Orchid: Image Latent Diffusion for Joint Appearance and Geometry Generation
Authors: Akshay Krishnan, Xinchen Yan, Vincent Casser, Abhijit Kundu
Abstract: Diffusion models are state-of-the-art for image generation. Trained on large datasets, they capture expressive image priors that have been used for tasks like inpainting, depth prediction, and (surface) normal prediction. However, these models are typically trained for one specific task, e.g., a separate model for each of color, depth, and normal prediction. Such models do not exploit the intrinsic correlation between appearance and geometry, which often leads to inconsistent predictions. In this paper, we propose a novel image diffusion prior that jointly encodes appearance and geometry. We introduce Orchid, a diffusion model comprising a Variational Autoencoder (VAE) that encodes color, depth, and surface normals into a latent space, and a Latent Diffusion Model (LDM) that generates these joint latents. Orchid directly generates photo-realistic color images, relative depth, and surface normals from user-provided text, and can be used to create image-aligned partial 3D scenes seamlessly. It can also perform image-conditioned tasks like joint monocular depth and normal prediction, and is competitive in accuracy with state-of-the-art methods designed for those tasks alone. Lastly, our model learns a joint prior that can be used zero-shot as a regularizer for many inverse problems that entangle appearance and geometry; for example, we demonstrate its effectiveness in color-depth-normal inpainting, showcasing its applicability to 3D generation from sparse views.
Submitted 22 January, 2025; originally announced January 2025.
Comments: Project webpage: https://orchid3d.github.io

4. arXiv:2501.12157 [pdf, other] - cs.CV (Computer Vision and Pattern Recognition)
Fast-RF-Shimming: Accelerate RF Shimming in 7T MRI using Deep Learning
Authors: Zhengyi Lu, Hao Liang, Ming Lu, Xiao Wang, Xinqiang Yan, Yuankai Huo
Abstract: Ultrahigh-field (UHF) Magnetic Resonance Imaging (MRI) provides a high signal-to-noise ratio (SNR), enabling exceptional spatial resolution for clinical diagnostics and research. However, higher fields introduce challenges such as transmit radiofrequency (RF) field inhomogeneities, which result in uneven flip angles and image intensity artifacts that degrade image quality and limit clinical adoption. Traditional RF shimming methods, including Magnitude Least Squares (MLS) optimization, mitigate RF field inhomogeneity but are time-intensive and often require the presence of the patient. Recent machine learning methods, such as RF Shim Prediction by Iteratively Projected Ridge Regression and other deep learning architectures, offer alternative approaches but face challenges such as extensive training requirements, limited model complexity, and practical data constraints. This paper introduces a holistic learning-based framework, Fast RF Shimming, which achieves a 5000-fold speedup over MLS methods. First, random-initialized Adaptive Moment Estimation (Adam) derives reference shimming weights from multichannel RF fields. Next, a Residual Network (ResNet) maps RF fields to shimming outputs while incorporating a confidence parameter into the loss function. Finally, a Non-uniformity Field Detector (NFD) identifies extreme non-uniform outcomes. Comparative evaluations demonstrate significant improvements in both speed and predictive accuracy. The proposed pipeline also supports potential extensions, such as the integration of anatomical priors or multi-echo data, to enhance the robustness of RF field correction, offering a faster and more efficient solution to RF shimming challenges in UHF MRI.
Submitted 21 January, 2025; originally announced January 2025.
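The first stage above, deriving reference weights with Adam, amounts to optimizing complex per-channel weights so the combined transmit field is as uniform as possible. A hedged sketch of that idea (the simulated field maps, the coefficient-of-variation loss, and the optimizer settings are assumptions, not the paper's exact MLS objective):

```python
import torch

n_ch, H, W = 8, 32, 32
# Simulated complex per-channel B1+ maps (in practice: measured fields).
b1 = torch.randn(n_ch, H, W, dtype=torch.complex64)

# Complex shim weights, parameterized as real + imag parts for Adam.
w = torch.randn(n_ch, 2, requires_grad=True)
opt = torch.optim.Adam([w], lr=0.05)

for step in range(500):
    wc = torch.complex(w[:, 0], w[:, 1])            # (n_ch,) complex weights
    combined = (wc[:, None, None] * b1).sum(dim=0)  # superposed field
    mag = combined.abs()
    loss = mag.std() / mag.mean()   # non-uniformity as coefficient of variation
    opt.zero_grad()
    loss.backward()
    opt.step()
# wc now holds reference shimming weights for this set of field maps.
```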
5. arXiv:2501.10651 [pdf, other] - cs.DC (Distributed, Parallel, and Cluster Computing); cond-mat.mtrl-sci (Materials Science); cs.LG (Machine Learning)
MOFA: Discovering Materials for Carbon Capture with a GenAI- and Simulation-Based Workflow
Authors: Xiaoli Yan, Nathaniel Hudson, Hyun Park, Daniel Grzenda, J. Gregory Pauloski, Marcus Schwarting, Haochen Pan, Hassan Harb, Samuel Foreman, Chris Knight, Tom Gibbs, Kyle Chard, Santanu Chaudhuri, Emad Tajkhorshid, Ian Foster, Mohamad Moosavi, Logan Ward, E. A. Huerta
Abstract: We present MOFA, an open-source generative AI (GenAI) plus simulation workflow for high-throughput generation of metal-organic frameworks (MOFs) on large-scale high-performance computing (HPC) systems. MOFA addresses key challenges in integrating GPU-accelerated computing for GPU-intensive GenAI tasks, including distributed training and inference, alongside CPU- and GPU-optimized tasks for screening and filtering AI-generated MOFs using molecular dynamics, density functional theory, and Monte Carlo simulations. These heterogeneous tasks are unified within an online learning framework that optimizes the utilization of available CPU and GPU resources across HPC systems. Performance metrics from a 450-node (14,400 AMD Zen 3 CPUs + 1,800 NVIDIA A100 GPUs) supercomputer run demonstrate that MOFA achieves high-throughput generation of novel MOF structures, with CO₂ adsorption capacities ranking among the top 10 in the hypothetical MOF (hMOF) dataset. Furthermore, the production of high-quality MOFs scales linearly with the number of nodes utilized. The modular architecture of MOFA will facilitate its integration into other scientific applications that dynamically combine GenAI with large-scale simulations.
Submitted 17 January, 2025; originally announced January 2025.
Comments: 13 pages, 10 figures
6. arXiv:2501.08458 [pdf, other] - eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
RWKV-UNet: Improving UNet with Long-Range Cooperation for Effective Medical Image Segmentation
Authors: Juntao Jiang, Jiangning Zhang, Weixuan Liu, Muxuan Gao, Xiaobin Hu, Xiaoxiao Yan, Feiyue Huang, Yong Liu
Abstract: In recent years, there have been significant advancements in deep learning for medical image analysis, especially with convolutional neural networks (CNNs) and transformer models. However, CNNs face limitations in capturing long-range dependencies, while transformers suffer from high computational complexity. To address this, we propose RWKV-UNet, a novel model that integrates the RWKV (Receptance Weighted Key Value) structure into the U-Net architecture. This integration enhances the model's ability to capture long-range dependencies and improves contextual understanding, which is crucial for accurate medical image segmentation. We build a strong encoder from inverted residual RWKV (IR-RWKV) blocks that combine CNNs and RWKVs. We also propose a Cross-Channel Mix (CCM) module to improve skip connections with multi-scale feature fusion, achieving global channel-information integration. Experiments on benchmark datasets, including Synapse, ACDC, BUSI, CVC-ClinicDB, CVC-ColonDB, Kvasir-SEG, ISIC 2017, and GLAS, show that RWKV-UNet achieves state-of-the-art performance on various types of medical image segmentation. Additionally, smaller variants, RWKV-UNet-S and RWKV-UNet-T, balance accuracy and computational efficiency, making them suitable for broader clinical applications.
Submitted 14 January, 2025; originally announced January 2025.

7. arXiv:2501.03916 [pdf, other] - cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback
Authors: Jiakang Yuan, Xiangchao Yan, Botian Shi, Tao Chen, Wanli Ouyang, Bo Zhang, Lei Bai, Yu Qiao, Bowen Zhou
Abstract: The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can substantially improve research efficiency by enhancing data analysis, accelerating computation, and fostering novel idea generation. To move further towards the ultimate goal (i.e., automatic scientific research), in this paper we propose Dolphin, the first closed-loop, open-ended auto-research framework covering the entire process of human scientific research. Dolphin generates research ideas, performs experiments, and uses feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers, which are ranked by topic and task attributes. Then the code is automatically generated and debugged using an exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back into the next round of idea generation. Experiments are conducted on benchmark datasets covering different topics, and the results show that Dolphin can continuously generate novel ideas and complete experiments in a loop. We highlight that Dolphin can automatically propose methods comparable to the state-of-the-art in some tasks, such as 2D image classification and 3D point classification.
Submitted 10 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: 19 pages, 11 figures; homepage: https://alpha-innovator.github.io/Dolphin-project-page
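The closed loop itself fits in a few lines. Below is a purely schematic sketch with toy stand-ins (generate_ideas, run_experiment, and analyze are hypothetical placeholders, not Dolphin's actual components):

```python
import random

def generate_ideas(papers, feedback):
    """Toy stand-in: propose candidate ideas, skipping ones already tried."""
    tried = {f["idea"] for f in feedback}
    return [i for i in range(10) if i not in tried][:3]

def run_experiment(idea):
    """Toy stand-in for code generation + debugging + execution."""
    return {"score": random.random() * idea}

def analyze(idea, metrics):
    """Summarize the outcome so it can inform the next round."""
    return {"idea": idea, "score": metrics["score"]}

def closed_loop(papers, rounds=3):
    feedback = []
    for _ in range(rounds):
        for idea in generate_ideas(papers, feedback):
            feedback.append(analyze(idea, run_experiment(idea)))
    return max(feedback, key=lambda f: f["score"])

print(closed_loop(papers=[]))
```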
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 11 figures, and our homepage: https://alpha-innovator.github.io/Dolphin-project-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03001">arXiv:2501.03001</a> <span> [<a href="https://arxiv.org/pdf/2501.03001">pdf</a>, <a href="https://arxiv.org/format/2501.03001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Approximating N-Player Nash Equilibrium through Gradient Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dongge Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiang Yan</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zehao Dou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenhan Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xiaotie Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03001v1-abstract-short" style="display: inline;"> Decoding how rational agents should behave in shared systems remains a critical challenge within theoretical computer science, artificial intelligence and economics studies. Central to this challenge is the task of computing the solution concept of games, which is Nash equilibrium (NE). Although computing NE in even two-player cases are known to be PPAD-hard, approximation solutions are of intensi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03001v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03001v1-abstract-full" style="display: none;"> Decoding how rational agents should behave in shared systems remains a critical challenge within theoretical computer science, artificial intelligence and economics studies. Central to this challenge is the task of computing the solution concept of games, which is Nash equilibrium (NE). Although computing NE in even two-player cases are known to be PPAD-hard, approximation solutions are of intensive interest in the machine learning domain. In this paper, we present a gradient-based approach to obtain approximate NE in N-player general-sum games. Specifically, we define a distance measure to an NE based on pure strategy best response, thereby computing an NE can be effectively transformed into finding the global minimum of this distance function through gradient descent. We prove that the proposed procedure converges to NE with rate $O(1/T)$ ($T$ is the number of iterations) when the utility function is convex. Experimental results suggest our method outperforms Tsaknakis-Spirakis algorithm, fictitious play and regret matching on various types of N-player normal-form games in GAMUT. In addition, our method demonstrates robust performance with increasing number of players and number of actions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03001v1-abstract-full').style.display = 'none'; document.getElementById('2501.03001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16985">arXiv:2412.16985</a> <span> [<a href="https://arxiv.org/pdf/2412.16985">pdf</a>, <a href="https://arxiv.org/format/2412.16985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> BladeDISC++: Memory Optimizations Based On Symbolic Shape </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiulong Yuan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Wenting Shen</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xiafei Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Ang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16985v1-abstract-short" style="display: inline;"> Recent deep learning workloads exhibit dynamic characteristics, leading to the rising adoption of dynamic shape compilers. These compilers can generate efficient kernels for dynamic shape graphs characterized by a fixed graph topology and uncertain tensor shapes. However, memory optimization, although particularly crucial in this large model era, remains relatively underexplored for dynamic shape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16985v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16985v1-abstract-full" style="display: none;"> Recent deep learning workloads exhibit dynamic characteristics, leading to the rising adoption of dynamic shape compilers. These compilers can generate efficient kernels for dynamic shape graphs characterized by a fixed graph topology and uncertain tensor shapes. However, memory optimization, although particularly crucial in this large model era, remains relatively underexplored for dynamic shape graphs. The fundamental challenge lies in the lack of precise tensor shapes which are essential in conventional methods such as operation scheduling(op scheduling) and rematerialization. To address this challenge, we propose op scheduling and rematerialization approaches based on symbolic shapes and developed BladeDISC++. Besides, since rematerialization decisions cannot be made solely at compile time when tensor shapes are unknown, BladeDISC++ employs a compilation-runtime combined strategy to optimally address shape dynamics. 
9. arXiv:2412.16985 [pdf, other] - cs.DC (Distributed, Parallel, and Cluster Computing)
BladeDISC++: Memory Optimizations Based On Symbolic Shape
Authors: Xiulong Yuan, Xu Yan, Wenting Shen, Xiafei Qiu, Ang Wang, Jie Zhang, Yong Li, Wei Lin
Abstract: Recent deep learning workloads exhibit dynamic characteristics, leading to the rising adoption of dynamic shape compilers. These compilers can generate efficient kernels for dynamic shape graphs characterized by a fixed graph topology and uncertain tensor shapes. However, memory optimization, although particularly crucial in this large-model era, remains relatively underexplored for dynamic shape graphs. The fundamental challenge lies in the lack of precise tensor shapes, which are essential in conventional methods such as operation scheduling (op scheduling) and rematerialization. To address this challenge, we propose op scheduling and rematerialization approaches based on symbolic shapes, implemented in BladeDISC++. Moreover, since rematerialization decisions cannot be made solely at compile time when tensor shapes are unknown, BladeDISC++ employs a combined compilation-runtime strategy to optimally address shape dynamics. Evaluations indicate that BladeDISC++ effectively reduces memory usage for dynamic shape graphs, achieving memory consumption comparable to optimizations using precise shapes and thereby promoting the broader adoption of dynamic shape compilers.
Submitted 22 December, 2024; originally announced December 2024.
Journal ref: NeurIPS 2024, https://neurips.cc/virtual/2024/103601 (accessed 22 December 2024)

10. arXiv:2412.13772 [pdf, other] - cs.CV (Computer Vision and Pattern Recognition)
An Efficient Occupancy World Model via Decoupled Dynamic Flow and Image-assisted Training
Authors: Haiming Zhang, Ying Xue, Xu Yan, Jiacheng Zhang, Weichao Qiu, Dongfeng Bai, Bingbing Liu, Shuguang Cui, Zhen Li
Abstract: The field of autonomous driving is experiencing a surge of interest in world models, which aim to predict potential future scenarios based on historical observations. In this paper, we introduce DFIT-OccWorld, an efficient 3D occupancy world model that leverages decoupled dynamic flow and an image-assisted training strategy, substantially improving 4D scene forecasting performance. To simplify the training process, we discard the previous two-stage training strategy and innovatively reformulate the occupancy forecasting problem as a decoupled voxel warping process: our model forecasts future dynamic voxels by warping existing observations using voxel flow, whereas static voxels are easily obtained through pose transformation. Moreover, our method incorporates an image-assisted training paradigm to enhance prediction reliability: differentiable volume rendering is adopted to generate rendered depth maps from predicted future volumes, which are used in a render-based photometric consistency loss. Experiments demonstrate the effectiveness of our approach, showcasing state-of-the-art performance on the nuScenes and OpenScene benchmarks for 4D occupancy forecasting, end-to-end motion planning, and point cloud forecasting, while incurring substantially lower computational costs than existing 3D world models.
Submitted 18 December, 2024; originally announced December 2024.
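A rough sketch of the decoupled warping idea (illustrative shapes and a nearest-voxel scatter; the paper's warping sits inside a learned, differentiable pipeline): dynamic voxels are displaced by a per-voxel flow field, while static voxels would instead be moved by the ego-pose transform.

```python
import numpy as np

def warp_dynamic(occ: np.ndarray, flow: np.ndarray) -> np.ndarray:
    """Move occupied dynamic voxels along a per-voxel displacement field.
    occ: (X, Y, Z) bool occupancy; flow: (X, Y, Z, 3) voxel displacements."""
    out = np.zeros_like(occ)
    idx = np.argwhere(occ)                        # coordinates of occupied voxels
    tgt = idx + flow[occ].round().astype(int)     # displaced coordinates
    ok = ((tgt >= 0) & (tgt < occ.shape)).all(1)  # keep in-bounds targets
    out[tuple(tgt[ok].T)] = True
    return out

occ = np.zeros((8, 8, 4), bool); occ[2, 2, 1] = True
flow = np.zeros((8, 8, 4, 3)); flow[2, 2, 1] = (1, 0, 0)
assert warp_dynamic(occ, flow)[3, 2, 1]           # voxel moved one step in x
```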
11. arXiv:2412.12686 [pdf, other] - cs.CL (Computation and Language)
XTransplant: A Probe into the Upper Bound Performance of Multilingual Capability and Culture Adaptability in LLMs via Mutual Cross-lingual Feed-forward Transplantation
Authors: Yangfan Ye, Xiaocheng Feng, Xiachong Feng, Libo Qin, Yichong Huang, Lei Huang, Weitao Ma, Zhirui Zhang, Yunfei Lu, Xiaohui Yan, Duyu Tang, Dandan Tu, Bing Qin
Abstract: Current large language models (LLMs) often exhibit imbalances in multilingual capabilities and cultural adaptability, largely due to their English-centric pretraining data. To address this imbalance, we propose a probing method named XTransplant that explores cross-lingual latent interactions via cross-lingual feed-forward transplantation during the inference stage, with the hope of enabling the model to leverage the strengths of both English and non-English languages. Through extensive pilot experiments, we empirically show that both the multilingual capabilities and cultural adaptability of LLMs can be significantly improved by XTransplant, from En -> non-En and non-En -> En respectively, highlighting the underutilization of current LLMs' multilingual potential. The patterns observed in these pilot experiments further motivate an offline scaling inference strategy, which demonstrates consistent performance improvements in multilingual and culture-aware tasks, sometimes even surpassing multilingual supervised fine-tuning. We hope our further analysis and discussion provide deeper insight into the XTransplant mechanism.
Submitted 17 December, 2024; originally announced December 2024.
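Feed-forward transplantation can be pictured with PyTorch forward hooks. The sketch below uses a toy single transformer block, not the paper's setup: the block, dimensions, and inputs are invented, and XTransplant itself transplants activations between corresponding layers under parallel prompts in different languages.

```python
import torch
import torch.nn as nn

class Block(nn.Module):
    """Toy pre-residual transformer block: attention + feed-forward."""
    def __init__(self, d=16):
        super().__init__()
        self.attn = nn.MultiheadAttention(d, 2, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))
    def forward(self, x):
        x = x + self.attn(x, x, x)[0]
        return x + self.ffn(x)

block = Block()
en_input = torch.randn(1, 4, 16)   # stand-in for an English prompt
zh_input = torch.randn(1, 4, 16)   # stand-in for the non-English prompt

# 1) Capture the FFN activation from the English-input pass.
cache = {}
handle = block.ffn.register_forward_hook(lambda m, i, o: cache.update(out=o))
block(en_input)
handle.remove()

# 2) Transplant it into the non-English pass: returning a value from a
#    forward hook replaces the module's output.
handle = block.ffn.register_forward_hook(lambda m, i, o: cache["out"])
out = block(zh_input)
handle.remove()
```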

arXiv:2412.11863 [pdf, other] cs.CV cs.CL
GeoX: Geometric Problem Solving Through Unified Formalized Vision-Language Pre-training
Authors: Renqiu Xia, Mingsheng Li, Hancheng Ye, Wenjie Wu, Hongbin Zhou, Jiakang Yuan, Tianshuo Peng, Xinyu Cai, Xiangchao Yan, Bin Wang, Conghui He, Botian Shi, Tao Chen, Junchi Yan, Bo Zhang
Abstract: Despite their proficiency in general tasks, Multi-modal Large Language Models (MLLMs) struggle with automatic Geometry Problem Solving (GPS), which demands understanding diagrams, interpreting symbols, and performing complex reasoning. This limitation arises from their pre-training on natural images and texts, along with the lack of automated verification in the problem-solving process. Besides, current geometric specialists are limited by their task-specific designs, making them less effective for broader geometric problems. To this end, we present GeoX, a multi-modal large model focusing on geometric understanding and reasoning tasks. Given the significant differences between geometric diagram-symbol and natural image-text, we introduce unimodal pre-training to develop a diagram encoder and symbol decoder, enhancing the understanding of geometric images and corpora. Furthermore, we introduce geometry-language alignment, an effective pre-training paradigm that bridges the modality gap between unimodal geometric experts. We propose a Generator-And-Sampler Transformer (GS-Former) to generate discriminative queries and eliminate uninformative representations from unevenly distributed geometric signals. Finally, GeoX benefits from visual instruction tuning, empowering it to take geometric images and questions as input and generate verifiable solutions. Experiments show that GeoX outperforms both generalists and geometric specialists on publicly recognized benchmarks, such as GeoQA, UniGeo, Geometry3K, and PGPS9k.
Submitted 10 January, 2025; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: Our code is available at https://github.com/Alpha-Innovator/GeoX
class="abstract-short has-text-grey-dark mathjax" id="2412.11551v1-abstract-short" style="display: inline;"> Rapid advancements in speech synthesis and voice conversion bring convenience but also new security risks, creating an urgent need for effective audio deepfake detection. Although current models perform well, their effectiveness diminishes when confronted with the diverse and evolving nature of real-world deepfakes. To address this issue, we propose a continual learning method named Region-Based O… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11551v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11551v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11551v1-abstract-full" style="display: none;"> Rapid advancements in speech synthesis and voice conversion bring convenience but also new security risks, creating an urgent need for effective audio deepfake detection. Although current models perform well, their effectiveness diminishes when confronted with the diverse and evolving nature of real-world deepfakes. To address this issue, we propose a continual learning method named Region-Based Optimization (RegO) for audio deepfake detection. Specifically, we use the Fisher information matrix to measure important neuron regions for real and fake audio detection, dividing them into four regions. First, we directly fine-tune the less important regions to quickly adapt to new tasks. Next, we apply gradient optimization in parallel for regions important only to real audio detection, and in orthogonal directions for regions important only to fake audio detection. For regions that are important to both, we use sample proportion-based adaptive gradient optimization. This region-adaptive optimization ensures an appropriate trade-off between memory stability and learning plasticity. Additionally, to address the increase of redundant neurons from old tasks, we further introduce the Ebbinghaus forgetting mechanism to release them, thereby promoting the capability of the model to learn more generalized discriminative features. Experimental results show our method achieves a 21.3% improvement in EER over the state-of-the-art continual learning approach RWM for audio deepfake detection. Moreover, the effectiveness of RegO extends beyond the audio deepfake detection domain, showing potential significance in other tasks, such as image recognition. The code is available at https://github.com/cyjie429/RegO <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11551v1-abstract-full').style.display = 'none'; document.getElementById('2412.11551v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04741">arXiv:2412.04741</a> <span> [<a href="https://arxiv.org/pdf/2412.04741">pdf</a>, <a href="https://arxiv.org/format/2412.04741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Question Answering for Decisionmaking in Green Building Design: A Multimodal Data Reasoning Method Driven by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yihui Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiaoyue Yan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Borong Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04741v1-abstract-short" style="display: inline;"> In recent years, the critical role of green buildings in addressing energy consumption and environmental issues has become widely acknowledged. Research indicates that over 40% of potential energy savings can be achieved during the early design stage. Therefore, decision-making in green building design (DGBD), which is based on modeling and performance simulation, is crucial for reducing building… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04741v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04741v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04741v1-abstract-full" style="display: none;"> In recent years, the critical role of green buildings in addressing energy consumption and environmental issues has become widely acknowledged. Research indicates that over 40% of potential energy savings can be achieved during the early design stage. Therefore, decision-making in green building design (DGBD), which is based on modeling and performance simulation, is crucial for reducing building energy costs. However, the field of green building encompasses a broad range of specialized knowledge, which involves significant learning costs and results in low decision-making efficiency. Many studies have already applied artificial intelligence (AI) methods to this field. Based on previous research, this study innovatively integrates large language models with DGBD, creating GreenQA, a question answering framework for multimodal data reasoning. Utilizing Retrieval Augmented Generation, Chain of Thought, and Function Call methods, GreenQA enables multimodal question answering, including weather data analysis and visualization, retrieval of green building cases, and knowledge query. Additionally, this study conducted a user survey using the GreenQA web platform. 
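
The four-region update rule can be sketched directly on a parameter tensor. In the fragment below, the importance threshold tau, the reference direction ref_dir, the projection forms, and the 0.5 adaptive factor are all assumptions standing in for the paper's Fisher-based partition and sample-proportion rule.

```python
import torch

def rego_style_update(param, grad, fisher_real, fisher_fake, ref_dir,
                      lr=1e-3, tau=0.5):
    """One illustrative region-based parameter update (not RegO's exact math).

    fisher_real / fisher_fake: per-parameter Fisher importance for real/fake
    audio detection on previous tasks; ref_dir: a reference gradient from an
    old task used to define the parallel/orthogonal directions.
    """
    real_imp = fisher_real > tau
    fake_imp = fisher_fake > tau

    unit = ref_dir / (ref_dir.norm() + 1e-12)
    along = (grad * unit).sum() * unit                      # component parallel to ref_dir

    g = grad.clone()                                        # unimportant region: plain fine-tuning
    g = torch.where(real_imp & ~fake_imp, along, g)         # parallel update
    g = torch.where(fake_imp & ~real_imp, grad - along, g)  # orthogonal update
    g = torch.where(real_imp & fake_imp, 0.5 * grad, g)     # adaptive region (factor assumed)
    param.data -= lr * g
```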

arXiv:2412.04741 [pdf, other] cs.AI cs.CL cs.HC
Question Answering for Decisionmaking in Green Building Design: A Multimodal Data Reasoning Method Driven by Large Language Models
Authors: Yihui Li, Xiaoyue Yan, Hao Zhou, Borong Lin
Abstract: In recent years, the critical role of green buildings in addressing energy consumption and environmental issues has become widely acknowledged. Research indicates that over 40% of potential energy savings can be achieved during the early design stage. Therefore, decision-making in green building design (DGBD), which is based on modeling and performance simulation, is crucial for reducing building energy costs. However, the field of green building encompasses a broad range of specialized knowledge, which involves significant learning costs and results in low decision-making efficiency. Many studies have already applied artificial intelligence (AI) methods to this field. Based on previous research, this study innovatively integrates large language models with DGBD, creating GreenQA, a question answering framework for multimodal data reasoning. Utilizing Retrieval Augmented Generation, Chain of Thought, and Function Call methods, GreenQA enables multimodal question answering, including weather data analysis and visualization, retrieval of green building cases, and knowledge query. Additionally, this study conducted a user survey using the GreenQA web platform. The results showed that 96% of users believed the platform helped improve design efficiency. This study not only effectively supports DGBD but also provides inspiration for AI-assisted design.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Published at Association for Computer Aided Design in Architecture (ACADIA) 2024
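
The three named techniques compose into a short dispatch loop. A minimal sketch follows, with retrieve, tools, and llm as hypothetical callables; the paper does not specify its prompts or tool schemas, so everything here is illustrative.

```python
def greenqa_style_answer(question, retrieve, tools, llm):
    """Retrieve context (RAG), let the LLM pick a tool (Function Call),
    then compose a step-by-step answer (Chain of Thought)."""
    context = retrieve(question)
    tool_name = llm(
        f"Question: {question}\nContext: {context}\n"
        "Pick one tool: weather_analysis, case_retrieval, knowledge_query."
    ).strip()
    tool_output = tools[tool_name](question)
    return llm(f"Given {tool_output!r}, answer step by step: {question}")
```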

arXiv:2412.01425 [pdf, other] cs.SD cs.AI eess.AS
Reject Threshold Adaptation for Open-Set Model Attribution of Deepfake Audio
Authors: Xinrui Yan, Jiangyan Yi, Jianhua Tao, Yujie Chen, Hao Gu, Guanjun Li, Junzuo Zhou, Yong Ren, Tao Xu
Abstract: Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, thresholds that effectively distinguish unknown categories in the current dataset may not be suitable for identifying known and unknown categories in another data distribution. To address these issues, we propose a novel framework for open set model attribution of deepfake audio with rejection threshold adaptation (ReTA). Specifically, the reconstruction error learning module trains by combining the representation of system fingerprints with labels corresponding to either the target class or a randomly chosen other class label. This process generates matching and non-matching reconstructed samples, establishing the reconstruction error distributions for each class and laying the foundation for the reject threshold calculation module. The reject threshold calculation module uses Gaussian probability estimation to fit the distributions of matching and non-matching reconstruction errors. It then computes adaptive reject thresholds for all classes through probability minimization criteria. The experimental results demonstrate the effectiveness of ReTA in improving open-set model attribution of deepfake audio.
Submitted 2 December, 2024; originally announced December 2024.
Comments: Accepted by ISCSLP 2024
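
The threshold-calculation step admits a compact closed form: fit Gaussians to the matching and non-matching reconstruction errors and place the class threshold where the two densities cross. The sketch below is one reading of the probability-minimization idea, not necessarily the paper's exact criterion.

```python
import numpy as np

def adaptive_reject_threshold(match_err, nonmatch_err):
    """Fit one Gaussian to each error population (1-D numpy arrays) and
    return the crossing point between their means as the reject threshold."""
    m1, s1 = match_err.mean(), match_err.std() + 1e-9
    m2, s2 = nonmatch_err.mean(), nonmatch_err.std() + 1e-9
    # Equating the two Gaussian pdfs gives a quadratic in x.
    a = 1 / (2 * s1**2) - 1 / (2 * s2**2)
    b = m2 / s2**2 - m1 / s1**2
    c = m1**2 / (2 * s1**2) - m2**2 / (2 * s2**2) + np.log(s1 / s2)
    roots = np.roots([a, b, c])
    roots = roots[np.isreal(roots)].real
    lo, hi = sorted((m1, m2))
    inside = roots[(roots > lo) & (roots < hi)]   # keep the crossing between the means
    return float(inside[0]) if len(inside) else float((m1 + m2) / 2)
```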

arXiv:2412.01316 [pdf, other] cs.CV cs.AI cs.MM
Long Video Diffusion Generation with Segmented Cross-Attention and Content-Rich Video Data Curation
Authors: Xin Yan, Yuxuan Cai, Qiuyue Wang, Yuan Zhou, Wenhao Huang, Huan Yang
Abstract: We introduce Presto, a novel video diffusion model designed to generate 15-second videos with long-range coherence and rich content. Extending video generation methods to maintain scenario diversity over long durations presents significant challenges. To address this, we propose a Segmented Cross-Attention (SCA) strategy, which splits hidden states into segments along the temporal dimension, allowing each segment to cross-attend to a corresponding sub-caption. SCA requires no additional parameters, enabling seamless incorporation into current DiT-based architectures. To facilitate high-quality long video generation, we build the LongTake-HD dataset, consisting of 261k content-rich videos with scenario coherence, annotated with an overall video caption and five progressive sub-captions. Experiments show that our Presto achieves 78.5% on the VBench Semantic Score and 100% on the Dynamic Degree, outperforming existing state-of-the-art video generation methods. This demonstrates that our proposed Presto significantly enhances content richness, maintains long-range coherence, and captures intricate textual details. More details are displayed on our project page: https://presto-video.github.io/.
Submitted 2 December, 2024; originally announced December 2024.
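
SCA's parameter-free property follows from its structure: hidden states are merely chunked along time and routed to different key/value sets. A minimal sketch, where attn is any existing cross-attention module (the module granularity and the chunking below are our assumptions):

```python
import torch

def segmented_cross_attention(hidden, sub_captions, attn):
    """hidden: (B, T, C) temporal hidden states; sub_captions: list of
    (B, L, C) text embeddings, one per segment; attn(query, context) is an
    existing cross-attention module, so no new parameters are introduced."""
    segments = hidden.chunk(len(sub_captions), dim=1)    # split along time
    out = [attn(seg, cap) for seg, cap in zip(segments, sub_captions)]
    return torch.cat(out, dim=1)                         # re-join the timeline
```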

arXiv:2411.19545 [pdf, other] cs.RO
A Unified Interaction Control Framework for Safe Robotic Ultrasound Scanning with Human-Intention-Aware Compliance
Authors: Xiangjie Yan, Shaqi Luo, Yongpeng Jiang, Mingrui Yu, Chen Chen, Senqiang Zhu, Gao Huang, Shiji Song, Xiang Li
Abstract: The ultrasound scanning robot operates in environments where frequent human-robot interactions occur. Most existing control methods for ultrasound scanning address only one specific interaction situation or implement hard switches between controllers for different situations, which compromises both safety and efficiency. In this paper, we propose a unified interaction control framework for ultrasound scanning robots capable of handling all common interactions, distinguishing both human-intended and unintended types, and adapting with appropriate compliance. Specifically, the robot suspends or modulates its ongoing main task if the interaction is intended, e.g., when the doctor grasps the robot to lead the end effector actively. Furthermore, it can identify unintended interactions and avoid potential collisions in the null space beforehand. Even if a collision has occurred, it becomes compliant with the collision in the null space and tries to reduce its impact on the main task (the ongoing scan) both kinematically and dynamically. The multiple situations are integrated into a unified controller with smooth transitions, exhibiting human-intention-aware compliance. Experimental results validate the framework's ability to cope with all common interactions, including intended intervention and unintended collision, in a collaborative carotid artery ultrasound scanning task.
Submitted 29 November, 2024; originally announced November 2024.
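
Null-space compliance of this flavor is usually built from the task Jacobian's null-space projector; the textbook construction below is offered only to make the idea concrete. The paper's unified controller adds intention detection and smooth transitions on top of such a primitive.

```python
import numpy as np

def task_priority_torque(J, f_task, tau_compliance):
    """J: (m, n) task Jacobian; f_task: (m,) main-task wrench;
    tau_compliance: (n,) interaction-compliance torques.  Compliance acts
    only in the null space of J, so the ongoing scan is disturbed least."""
    tau_task = J.T @ f_task
    N = np.eye(J.shape[1]) - np.linalg.pinv(J) @ J   # null-space projector
    return tau_task + N @ tau_compliance
```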

arXiv:2411.18850 [pdf, other] cs.CV
CrossTracker: Robust Multi-modal 3D Multi-Object Tracking via Cross Correction
Authors: Lipeng Gu, Xuefeng Yan, Weiming Wang, Honghua Chen, Dingkun Zhu, Liangliang Nan, Mingqiang Wei
Abstract: The fusion of camera- and LiDAR-based detections offers a promising solution to mitigate tracking failures in 3D multi-object tracking (MOT). However, existing methods predominantly exploit camera detections to correct tracking failures caused by potential LiDAR detection problems, neglecting the reciprocal benefit of refining camera detections using LiDAR data. This limitation is rooted in their single-stage architecture, akin to single-stage object detectors, which lacks a dedicated trajectory refinement module to fully exploit the complementary multi-modal information. To this end, we introduce CrossTracker, a novel two-stage paradigm for online multi-modal 3D MOT. CrossTracker operates in a coarse-to-fine manner, initially generating coarse trajectories and subsequently refining them through an independent refinement process. Specifically, CrossTracker incorporates three essential modules: (i) a multi-modal modeling (M^3) module that, by fusing multi-modal information (images, point clouds, and even plane geometry extracted from images), provides a robust metric for subsequent trajectory generation; (ii) a coarse trajectory generation (C-TG) module that generates initial coarse dual-stream trajectories; and (iii) a trajectory refinement (TR) module that refines coarse trajectories through cross correction between camera and LiDAR streams. Comprehensive experiments demonstrate the superior performance of our CrossTracker over its eighteen competitors, underscoring its effectiveness in harnessing the synergistic benefits of camera and LiDAR sensors for robust multi-modal 3D MOT.
Submitted 27 November, 2024; originally announced November 2024.

arXiv:2411.17027 [pdf, other] cs.CV
D$^2$-World: An Efficient World Model through Decoupled Dynamic Flow
Authors: Haiming Zhang, Xu Yan, Ying Xue, Zixuan Guo, Shuguang Cui, Zhen Li, Bingbing Liu
Abstract: This technical report summarizes the second-place solution for the Predictive World Model Challenge held at the CVPR-2024 Workshop on Foundation Models for Autonomous Systems. We introduce D$^2$-World, a novel world model that effectively forecasts future point clouds through decoupled dynamic flow. Specifically, the past semantic occupancies are obtained via existing occupancy networks (e.g., BEVDet). Following this, the occupancy results serve as the input for a single-stage world model, generating future occupancy in a non-autoregressive manner. To further simplify the task, dynamic voxel decoupling is performed in the world model. The model generates future dynamic voxels by warping the existing observations through voxel flow, while the remaining static voxels can be easily obtained through pose transformation. As a result, our approach achieves state-of-the-art performance on the OpenScene Predictive World Model benchmark, securing second place, and trains more than 300% faster than the baseline model. Code is available at https://github.com/zhanghm1995/D2-World.
Submitted 25 November, 2024; originally announced November 2024.
Comments: The 2nd Place and Innovation Award Solution of Predictive World Model at the CVPR 2024 Autonomous Grand Challenge

arXiv:2411.14716 [pdf, other] cs.CV cs.LG cs.RO
VisionPAD: A Vision-Centric Pre-training Paradigm for Autonomous Driving
Authors: Haiming Zhang, Wending Zhou, Yiyao Zhu, Xu Yan, Jiantao Gao, Dongfeng Bai, Yingjie Cai, Bingbing Liu, Shuguang Cui, Zhen Li
Abstract: This paper introduces VisionPAD, a novel self-supervised pre-training paradigm designed for vision-centric algorithms in autonomous driving. In contrast to previous approaches that employ neural rendering with explicit depth supervision, VisionPAD utilizes more efficient 3D Gaussian Splatting to reconstruct multi-view representations using only images as supervision. Specifically, we introduce a self-supervised method for voxel velocity estimation. By warping voxels to adjacent frames and supervising the rendered outputs, the model effectively learns motion cues in the sequential data. Furthermore, we adopt a multi-frame photometric consistency approach to enhance geometric perception. It projects adjacent frames to the current frame based on rendered depths and relative poses, boosting the 3D geometric representation through pure image supervision. Extensive experiments on autonomous driving datasets demonstrate that VisionPAD significantly improves performance in 3D object detection, occupancy prediction and map segmentation, surpassing state-of-the-art pre-training strategies by a considerable margin.
Submitted 21 November, 2024; originally announced November 2024.
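
The multi-frame photometric consistency term is the standard construction from self-supervised depth estimation: back-project the current frame with its rendered depth, reproject into the adjacent view, and sample. A sketch under assumed shapes (not VisionPAD's code):

```python
import torch
import torch.nn.functional as F

def photometric_warp(img_adj, depth_cur, K, T_cur2adj):
    """img_adj: (B,3,H,W) adjacent frame; depth_cur: (B,1,H,W) rendered depth;
    K: (3,3) intrinsics; T_cur2adj: (B,4,4) relative pose.  Returns the
    adjacent image warped into the current view."""
    B, _, H, W = depth_cur.shape
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    pix = torch.stack([xs, ys, torch.ones_like(xs)], 0).float().view(3, -1)
    rays = torch.linalg.inv(K) @ pix                         # camera rays, (3, H*W)
    pts = depth_cur.view(B, 1, -1) * rays.unsqueeze(0)       # back-projected points
    pts = torch.cat([pts, torch.ones(B, 1, H * W)], 1)       # homogeneous coordinates
    proj = (T_cur2adj @ pts)[:, :3]                          # into the adjacent frame
    uv = K.unsqueeze(0) @ proj
    uv = uv[:, :2] / uv[:, 2:].clamp(min=1e-6)               # perspective divide
    u = 2 * uv[:, 0] / (W - 1) - 1                           # normalize for grid_sample
    v = 2 * uv[:, 1] / (H - 1) - 1
    grid = torch.stack([u, v], -1).view(B, H, W, 2)
    return F.grid_sample(img_adj, grid, align_corners=True)
```

The loss is then a photometric error such as (img_cur - photometric_warp(img_adj, depth_cur, K, T)).abs().mean(), which supervises geometry through images alone.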

arXiv:2411.12584 [pdf, other] cs.CV cs.AI
Leveraging MLLM Embeddings and Attribute Smoothing for Compositional Zero-Shot Learning
Authors: Xudong Yan, Songhe Feng, Yang Zhang, Jian Yang, Yueguan Lin, Haojun Fei
Abstract: Compositional zero-shot learning (CZSL) aims to recognize novel compositions of attributes and objects learned from seen compositions. Previous works disentangle attribute and object by extracting shared and exclusive parts between image pairs sharing the same attribute (object), as well as aligning them with pretrained word embeddings to improve unseen attribute-object recognition. Despite the significant achievements of existing efforts, they are hampered by three limitations: (1) the efficacy of disentanglement is compromised due to the influence of the background and the intricate entanglement of attribute with object in the same parts; (2) existing word embeddings fail to capture complex multimodal semantic information; (3) overconfidence exhibited by existing models in seen compositions hinders their generalization to novel compositions. Being aware of these, we propose a novel framework named Multimodal Large Language Model (MLLM) embeddings and attribute smoothing guided disentanglement (TRIDENT) for CZSL. First, we leverage feature adaptive aggregation modules to mitigate the impact of background, and utilize learnable condition masks to capture multigranularity features for disentanglement. Then, the last hidden states of an MLLM are employed as word embeddings for their superior representation capabilities. Moreover, we propose attribute smoothing with auxiliary attributes generated by a Large Language Model (LLM) for seen compositions, addressing the issue of overconfidence by encouraging the model to learn more attributes in one given composition. Extensive experiments demonstrate that TRIDENT achieves state-of-the-art performance on three benchmarks.
Submitted 18 November, 2024; originally announced November 2024.
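
Attribute smoothing amounts to replacing the one-hot attribute target with a soft target that reserves probability mass for the LLM-generated auxiliary attributes. A minimal sketch (the mass eps and its uniform split are assumptions):

```python
import torch

def smoothed_attribute_target(n_attrs, true_idx, aux_idx, eps=0.1):
    """n_attrs: attribute vocabulary size; true_idx: the labeled attribute;
    aux_idx: non-empty list of auxiliary attribute ids from the LLM.
    Use the result with a soft-target cross-entropy to curb overconfidence."""
    target = torch.zeros(n_attrs)
    target[true_idx] = 1.0 - eps
    target[torch.tensor(aux_idx)] = eps / len(aux_idx)
    return target
```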

arXiv:2411.08544 [pdf, other] cs.AI
Deeper Insights into Learning Performance of Stochastic Configuration Networks
Authors: Xiufeng Yan, Dianhui Wang
Abstract: Stochastic Configuration Networks (SCNs) are a class of randomized neural networks that integrate randomized algorithms within an incremental learning framework. A defining feature of SCNs is the supervisory mechanism, which adaptively adjusts the distribution to generate effective random basis functions, thereby enabling error-free learning. In this paper, we present a comprehensive analysis of the impact of the supervisory mechanism on the learning performance of SCNs. Our findings reveal that the current SCN framework evaluates the effectiveness of each random basis function in reducing residual errors using a lower bound on its error reduction potential, which constrains SCNs' overall learning efficiency. Specifically, SCNs may fail to consistently select the most effective random candidate as the new basis function during each training iteration. To overcome this problem, we propose a novel method for evaluating the hidden layer's output matrix, supported by a new supervisory mechanism that accurately assesses the error reduction potential of random basis functions without requiring the computation of the Moore-Penrose inverse of the output matrix. This approach enhances the selection of basis functions, reducing computational complexity and improving the overall scalability and learning capabilities of SCNs. We introduce a Recursive Moore-Penrose Inverse-SCN (RMPI-SCN) training scheme based on the new supervisory mechanism and demonstrate its effectiveness through simulations on several benchmark datasets. Experiments show that RMPI-SCN outperforms the conventional SCN in terms of learning capability, underscoring its potential to advance the SCN framework for large-scale data modeling applications.
Submitted 13 November, 2024; originally announced November 2024.
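
Avoiding a fresh Moore-Penrose inverse per candidate is what a recursive update buys: when one column is appended to the hidden-layer output matrix, the pseudoinverse can be patched incrementally. Greville's classical recursion is sketched below; the abstract does not state that RMPI-SCN uses precisely this form, so treat it as illustrative.

```python
import numpy as np

def pinv_add_column(A, P, a, tol=1e-10):
    """Given A (n, k), P = pinv(A) (k, n), and a new column a (n,), return
    pinv([A a]) without recomputing a full pseudoinverse (Greville's method)."""
    d = P @ a                      # coefficients of a on the current columns
    c = a - A @ d                  # residual of the new column
    if np.linalg.norm(c) > tol:    # linearly independent column
        b = c / (c @ c)
    else:                          # (near-)dependent column
        b = (P.T @ d) / (1.0 + d @ d)
    return np.vstack([P - np.outer(d, b), b])
```

Scoring a candidate basis function then reduces to cheap updates of the residual through the patched pseudoinverse rather than a full recomputation.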

arXiv:2411.06041 [pdf, other] cs.CV cs.AI
PointCG: Self-supervised Point Cloud Learning via Joint Completion and Generation
Authors: Yun Liu, Peng Li, Xuefeng Yan, Liangliang Nan, Bing Wang, Honghua Chen, Lina Gong, Wei Zhao, Mingqiang Wei
Abstract: The core of self-supervised point cloud learning lies in setting up appropriate pretext tasks that enable the encoder to perceive 3D objects effectively. In this paper, we integrate two prevalent methods, masked point modeling (MPM) and 3D-to-2D generation, as pretext tasks within a pre-training framework. We leverage the spatial awareness and precise supervision offered by these two methods to address their respective limitations: ambiguous supervision signals and insensitivity to geometric information. Specifically, the proposed framework, abbreviated as PointCG, consists of a Hidden Point Completion (HPC) module and an Arbitrary-view Image Generation (AIG) module. We first capture visible points from arbitrary views as inputs by removing hidden points. Then, HPC extracts representations of the inputs with an encoder and completes the entire shape with a decoder, while AIG generates rendered images based on the visible points' representations. Extensive experiments demonstrate the superiority of the proposed method over the baselines in various downstream tasks. Our code will be made available upon acceptance.
Submitted 8 November, 2024; originally announced November 2024.
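
The two pretext tasks combine naturally into one objective: a completion loss on the recovered shape (HPC) plus a reconstruction loss on the generated views (AIG). A hypothetical sketch with Chamfer distance for completion, L1 for images, and an assumed weighting lam:

```python
import torch

def chamfer(p, q):
    """Symmetric Chamfer distance between point sets p (B,N,3) and q (B,M,3)."""
    d = torch.cdist(p, q)                        # (B, N, M) pairwise distances
    return d.min(2).values.mean() + d.min(1).values.mean()

def pointcg_style_loss(pred_pts, gt_pts, pred_imgs, gt_imgs, lam=1.0):
    """Joint pre-training objective sketch; the paper's exact losses may differ."""
    return chamfer(pred_pts, gt_pts) + lam * (pred_imgs - gt_imgs).abs().mean()
```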

arXiv:2411.00787 [pdf, other] cs.NI math.OC
Novel operational algorithms for ride-pooling as on-demand feeder services
Authors: Wenbo Fan, Xiaotian Yan, Zhanbo Sun, Xiaohui Yang
Abstract: Ride-pooling (RP) service, as a form of shared mobility, enables multiple riders with similar itineraries to share the same vehicle and split the fee. This makes RP a promising on-demand feeder service for patrons with a common trip end in urban transportation. We propose RP as Feeder (RPaF) services with tailored operational algorithms. Specifically, we have developed (i) a batch-based matching algorithm that pools a batch of requests within an optimized buffer distance of each RP vehicle; (ii) a dispatching algorithm that adaptively dispatches vehicles to pick up the matched requests toward a certain occupancy target; and (iii) a repositioning algorithm that relocates vehicles to unmatched requests based on their level of urgency. An agent-based microscopic simulation platform is designed to execute these operational algorithms (via the Operator module), generate spatially distributed random requests (Patron module), and account for traffic conditions (Vehicle module) in street networks. Extensive numerical experiments are conducted to showcase the effectiveness of RPaF services across various demand scenarios in typical morning rush hours. We compare RPaF with two on-demand feeder counterparts proposed in previous studies: Ride-Sharing as Feeder (RSaF) and Flexible-Route Feeder-Bus Transit (Flex-FBT). Comparisons reveal that, given the same fleet size, RPaF generally outperforms RSaF in service rates (i.e., the percentage of requests served over all requests) and Flex-FBT in average trip times for patrons. Lastly, we illustrate the implementation of RPaF in a real-world case study of the uptown Manhattan network (USA) using actual taxi trip data. The results demonstrate that RPaF effectively balances the level of service (service rate and patrons' average trip time) with operational costs (fleet size).
Submitted 17 October, 2024; originally announced November 2024.
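
Matching step (i) can be sketched as greedy nearest-first assignment within the buffer distance; how the buffer distance itself is optimized, and the paper's exact tie-breaking, are beyond this illustration.

```python
import numpy as np

def batch_match(vehicle_xy, request_xy, buffer_dist, capacity):
    """vehicle_xy: (V, 2); request_xy: (R, 2).  Each vehicle pools the
    nearest unmatched requests within `buffer_dist`, up to `capacity`."""
    assigned = {i: [] for i in range(len(vehicle_xy))}
    taken = np.zeros(len(request_xy), dtype=bool)
    for i, v in enumerate(vehicle_xy):
        dist = np.linalg.norm(request_xy - v, axis=1)
        for j in np.argsort(dist):
            if len(assigned[i]) >= capacity:
                break
            if not taken[j] and dist[j] <= buffer_dist:
                taken[j] = True
                assigned[i].append(int(j))
    return assigned
```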
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17617">arXiv:2410.17617</a> <span> [<a href="https://arxiv.org/pdf/2410.17617">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Graph Neural Networks for Enhanced Feature Extraction in Heterogeneous Information Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jianjun Wei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xin Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyi Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.17617v1-abstract-full"> This paper explores the applications and challenges of graph neural networks (GNNs) in processing the complex graph data brought about by the rapid development of the Internet. Given the heterogeneity and redundancy problems that graph data often have, traditional GNN methods may be overly dependent on the initial structure and attribute information of the graph, which limits their ability to accurately simulate more complex relationships and patterns in the graph. Therefore, this study proposes a graph neural network model under a self-supervised learning framework, which can flexibly combine different types of additional information about the attribute graph and its nodes, so as to better mine the deep features in the graph data. By introducing a self-supervisory mechanism, the model is expected to improve its adaptability to the diversity and complexity of graph data and to improve overall performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09873">arXiv:2410.09873</a> <span> [<a href="https://arxiv.org/pdf/2410.09873">pdf</a>, <a href="https://arxiv.org/format/2410.09873">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Training-Free Adaptive Diffusion with Bounded Difference Approximation Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hancheng Ye</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+R">Renqiu Xia</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiangchao Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.09873v1-abstract-full"> Diffusion models have recently achieved great success in the synthesis of high-quality images and videos. However, the existing denoising techniques in diffusion models are commonly based on step-by-step noise predictions, which suffer from high computation cost, resulting in prohibitive latency for interactive applications. In this paper, we propose AdaptiveDiffusion to relieve this bottleneck by adaptively reducing the number of noise prediction steps during the denoising process. Our method considers the potential of skipping as many noise prediction steps as possible while keeping the final denoised results identical to the original full-step ones. Specifically, the skipping strategy is guided by the third-order latent difference, which indicates the stability between timesteps during the denoising process and thereby benefits the reuse of previous noise prediction results. Extensive experiments on image and video diffusion models demonstrate that our method can significantly speed up the denoising process while generating identical results to the original process, achieving an average 2~5x speedup without quality degradation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024. Homepage: https://jiakangyuan.github.io/AdaptiveDiffusion-project-page/; code: https://github.com/UniModal4Reasoning/AdaptiveDiffusion</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08410">arXiv:2410.08410</a> <span> [<a href="https://arxiv.org/pdf/2410.08410">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Human Stone Toolmaking Action Grammar (HSTAG): A Challenging Benchmark for Fine-grained Motor Behavior Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cheng Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xuyang Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zekun Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Cheng Ding</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tianhao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jannati%2C+S">Shaya Jannati</a>, <a href="/search/cs?searchtype=author&query=Martinez%2C+C">Cynthia Martinez</a>, <a href="/search/cs?searchtype=author&query=Stout%2C+D">Dietrich Stout</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.08410v1-abstract-full"> Action recognition has witnessed the development of a growing number of novel algorithms and datasets in the past decade. However, the majority of public benchmarks were constructed around activities of daily living and annotated at a rather coarse-grained level, which lacks diversity in domain-specific datasets, especially for rarely seen domains. In this paper, we introduce Human Stone Toolmaking Action Grammar (HSTAG), a meticulously annotated video dataset showcasing previously undocumented stone toolmaking behaviors, which can be used for investigating the applications of advanced artificial intelligence techniques in understanding a rapid succession of complex interactions between two hand-held objects. HSTAG consists of 18,739 video clips that record 4.5 hours of experts' activities in stone toolmaking. Its unique features include (i) brief action durations and frequent transitions, mirroring the rapid changes inherent in many motor behaviors; (ii) multiple angles of view and switches among multiple tools, increasing intra-class variability; and (iii) unbalanced class distributions and high similarity among different action sequences, adding difficulty in capturing distinct patterns for each action. Several mainstream action recognition models are used to conduct experimental analysis, which showcases the challenges and uniqueness of HSTAG. The dataset is available at https://nyu.databrary.org/volume/1697. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures, accepted by the 11th IEEE International Conference on Data Science and Advanced Analytics (DSAA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07927">arXiv:2410.07927</a> <span> [<a href="https://arxiv.org/pdf/2410.07927">pdf</a>, <a href="https://arxiv.org/format/2410.07927">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient Reinforcement Learning with Large Language Model Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xue Yan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xidong Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mengyue Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haifeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Ammar%2C+H+B">Haitham Bou Ammar</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.07927v1-abstract-full"> In sequential decision-making (SDM) tasks, methods like reinforcement learning (RL) and heuristic search have made notable advances in specific cases. However, they often require extensive exploration and face challenges in generalizing across diverse environments due to their limited grasp of the underlying decision dynamics. In contrast, large language models (LLMs) have recently emerged as powerful general-purpose tools, due to their capacity to maintain vast amounts of domain-specific knowledge. To harness this rich prior knowledge for efficiently solving complex SDM tasks, we propose treating LLMs as prior action distributions and integrating them into RL frameworks through Bayesian inference methods, making use of variational inference and direct posterior sampling. The proposed approaches facilitate the seamless incorporation of fixed LLM priors into both policy-based and value-based RL frameworks. Our experiments show that incorporating LLM-based action priors significantly reduces exploration and optimization complexity, substantially improving sample efficiency compared to traditional RL techniques, e.g., using LLM priors decreases the number of required samples by over 90% in offline learning scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
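<p class="is-size-7">One simple way to realize an LLM action prior in a discrete-action setting is posterior-weighted sampling. A minimal sketch, assuming the LLM exposes prior probabilities over the action set (faked here) and the agent keeps Q-value estimates; the softmax-posterior rule shown is one plausible instantiation, not necessarily the paper's exact formulation:</p> <pre><code>import numpy as np

def posterior_sample(prior_probs, q_values, temperature=1.0, rng=None):
    """Sample an action from p(a) proportional to prior(a) * exp(Q(a)/T),
    i.e., a Bayesian combination of the LLM prior and the learned values."""
    rng = rng or np.random.default_rng()
    logits = np.log(np.asarray(prior_probs, dtype=float) + 1e-12)
    logits = logits + np.asarray(q_values, dtype=float) / temperature
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    return int(rng.choice(len(probs), p=probs))

llm_prior = [0.7, 0.2, 0.1]   # the LLM thinks action 0 is most plausible
q_values  = [0.0, 1.0, 0.0]   # learned values favour action 1
print(posterior_sample(llm_prior, q_values, temperature=0.5))
</code></pre>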
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04932">arXiv:2410.04932</a> <span> [<a href="https://arxiv.org/pdf/2410.04932">pdf</a>, <a href="https://arxiv.org/format/2410.04932">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OmniBooth: Learning Latent Control for Image Synthesis with Multi-modal Instruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Leheng Li</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+W">Weichao Qiu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jing He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kaiqiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+Q">Qing Lian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.04932v1-abstract-full"> We present OmniBooth, an image generation framework that enables spatial control with instance-level multi-modal customization. For all instances, the multimodal instruction can be described through text prompts or image references. Given a set of user-defined masks and associated text or image guidance, our objective is to generate an image where multiple objects are positioned at specified coordinates and their attributes are precisely aligned with the corresponding guidance. This approach significantly expands the scope of text-to-image generation, and elevates it to a more versatile and practical dimension in controllability. In this paper, our core contribution lies in the proposed latent control signals, a high-dimensional spatial feature that provides a unified representation to integrate the spatial, textual, and image conditions seamlessly. The text condition extends ControlNet to provide instance-level open-vocabulary generation. The image condition further enables fine-grained control with personalized identity. In practice, our method empowers users with more flexibility in controllable generation, as users can choose multi-modal conditions from text or images as needed. Furthermore, thorough experiments demonstrate our enhanced performance in image synthesis fidelity and alignment across different tasks and datasets. Project page: https://len-li.github.io/omnibooth-web/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03129">arXiv:2410.03129</a> <span> [<a href="https://arxiv.org/pdf/2410.03129">pdf</a>, <a href="https://arxiv.org/format/2410.03129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ARB-LLM: Alternating Refined Binarizations for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiteng Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xianglong Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianao Zhang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haotong Qin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+D">Dong Xie</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiang Tian</a>, <a href="/search/cs?searchtype=author&query=shi%2C+z">zhongchao shi</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Linghe Kong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.03129v2-abstract-full"> Large Language Models (LLMs) have greatly pushed forward advancements in natural language processing, yet their high memory and computational demands hinder practical deployment. Binarization, as an effective compression technique, can shrink model weights to just 1 bit, significantly reducing the high demands on computation and memory. However, current binarization methods struggle to narrow the distribution gap between binarized and full-precision weights, while also overlooking the column deviation in LLM weight distribution. To tackle these issues, we propose ARB-LLM, a novel 1-bit post-training quantization (PTQ) technique tailored for LLMs. To narrow the distribution shift between binarized and full-precision weights, we first design an alternating refined binarization (ARB) algorithm to progressively update the binarization parameters, which significantly reduces the quantization error. Moreover, considering the pivotal role of calibration data and the column deviation in LLM weights, we further extend ARB to ARB-X and ARB-RC. In addition, we refine the weight partition strategy with a column-group bitmap (CGB), which further enhances performance. Equipping ARB-X and ARB-RC with CGB, we obtain ARB-LLM$_\text{X}$ and ARB-LLM$_\text{RC}$ respectively, which significantly outperform state-of-the-art (SOTA) binarization methods for LLMs. As a binary PTQ method, our ARB-LLM$_\text{RC}$ is the first to surpass FP16 models of the same size. The code and models will be available at https://github.com/ZHITENGLI/ARB-LLM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code and models will be available at https://github.com/ZHITENGLI/ARB-LLM</span> </p> </li>
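<p class="is-size-7">The alternating refinement idea can be illustrated with a plain rank-1 binarization W ≈ alpha·B + mu. A sketch under that assumption only; the closed-form updates below are standard least-squares steps, and the calibration-aware and row/column variants (ARB-X, ARB-RC, CGB) are not modeled:</p> <pre><code>import numpy as np

def arb_binarize(W, iters=5):
    """Approximate W with alpha * B + mu (B in {-1, +1}) by alternating
    closed-form updates of (alpha, mu) with re-binarization of B."""
    mu = W.mean()
    B = np.sign(W - mu)
    B[B == 0] = 1.0
    for _ in range(iters):
        alpha = ((W - mu) * B).mean()   # least-squares scale given B, mu
        mu = (W - alpha * B).mean()     # least-squares offset given B
        B = np.sign(W - mu)             # refine the binary codes
        B[B == 0] = 1.0
    return alpha, mu, B

rng = np.random.default_rng(0)
W = rng.standard_normal((4, 8))
alpha, mu, B = arb_binarize(W)
print(np.abs(W - (alpha * B + mu)).mean())  # mean reconstruction error
</code></pre>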
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00988">arXiv:2410.00988</a> <span> [<a href="https://arxiv.org/pdf/2410.00988">pdf</a>, <a href="https://arxiv.org/format/2410.00988">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Creative and Context-Aware Translation of East Asian Idioms with GPT-4 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+K">Kenan Tang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+P">Peiyang Song</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yao Qin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xifeng Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.00988v1-abstract-full"> As a type of figurative language, an East Asian idiom condenses rich cultural background into only a few characters. Translating such idioms is challenging for human translators, who often resort to choosing a context-aware translation from an existing list of candidates. However, compiling a dictionary of candidate translations demands much time and creativity even for expert translators. To alleviate this burden, we evaluate whether GPT-4 can help generate high-quality translations. Based on automatic evaluations of faithfulness and creativity, we first identify Pareto-optimal prompting strategies that can outperform translation engines from Google and DeepL. Then, at a low cost, our context-aware translations can achieve far more high-quality translations per idiom than the human baseline. We open-source all code and data to facilitate further research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00337">arXiv:2410.00337</a> <span> [<a href="https://arxiv.org/pdf/2410.00337">pdf</a>, <a href="https://arxiv.org/format/2410.00337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SyntheOcc: Synthesize Geometric-Controlled Street View Images through 3D Semantic MPIs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Leheng Li</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+W">Weichao Qiu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yingjie Cai</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xu Yan</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+Q">Qing Lian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingbing Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.00337v1-abstract-full"> The advancement of autonomous driving is increasingly reliant on high-quality annotated datasets, especially in the task of 3D occupancy prediction, where the occupancy labels require dense 3D annotation with significant human effort. In this paper, we propose SyntheOcc, a diffusion model that Synthesizes photorealistic and geometric-controlled images by conditioning on Occupancy labels in driving scenarios. This yields an unlimited amount of diverse, annotated, and controllable datasets for applications like training perception models and simulation. SyntheOcc addresses the critical challenge of how to efficiently encode 3D geometric information as conditional input to a 2D diffusion model. Our approach innovatively incorporates 3D semantic multi-plane images (MPIs) to provide comprehensive and spatially aligned 3D scene descriptions for conditioning. As a result, SyntheOcc can generate photorealistic multi-view images and videos that faithfully align with the given geometric labels (semantics in 3D voxel space). Extensive qualitative and quantitative evaluations of SyntheOcc on the nuScenes dataset prove its effectiveness in generating controllable occupancy datasets that serve as effective data augmentation for perception models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16807">arXiv:2409.16807</a> <span> [<a href="https://arxiv.org/pdf/2409.16807">pdf</a>, <a href="https://arxiv.org/format/2409.16807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Few Hypocrites: Few-Shot Learning and Subtype Definitions for Detecting Hypocrisy Accusations in Online Climate Change Debates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Corral%2C+P+G">Paulina Garcia Corral</a>, <a href="/search/cs?searchtype=author&query=Green%2C+A">Avishai Green</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+H">Hendrik Meyer</a>, <a href="/search/cs?searchtype=author&query=Stoll%2C+A">Anke Stoll</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiaoyue Yan</a>, <a href="/search/cs?searchtype=author&query=Reuver%2C+M">Myrthe Reuver</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.16807v1-abstract-full"> The climate crisis is a salient issue in online discussions, and hypocrisy accusations are a central rhetorical element in these debates. However, for large-scale text analysis, hypocrisy accusation detection is an understudied tool, most often defined as a smaller subtask of fallacious argument detection. In this paper, we define hypocrisy accusation detection as an independent task in NLP, and identify different relevant subtypes of hypocrisy accusations. Our Climate Hypocrisy Accusation Corpus (CHAC) consists of 420 Reddit climate debate comments, expert-annotated into two different types of hypocrisy accusations: personal versus political hypocrisy. We evaluate few-shot in-context learning with 6 shots and 3 instruction-tuned Large Language Models (LLMs) for detecting hypocrisy accusations in this dataset. Results indicate that the GPT-4o and Llama-3 models in particular show promise in detecting hypocrisy accusations (F1 reaching 0.68, while previous work shows F1 of 0.44). However, context matters for a complex semantic concept such as hypocrisy accusations, and we find models struggle especially at identifying political hypocrisy accusations compared to personal moral hypocrisy. Our study contributes new insights in hypocrisy detection and climate change discourse, and is a stepping stone for large-scale analysis of hypocrisy accusations in online climate debates. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Cite the public version, published at CPSS 2024 @ KONVENS</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15045">arXiv:2409.15045</a> <span> [<a href="https://arxiv.org/pdf/2409.15045">pdf</a>, <a href="https://arxiv.org/format/2409.15045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AIM 2024 Sparse Neural Rendering Challenge: Methods and Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nazarczuk%2C+M">Michal Nazarczuk</a>, <a href="/search/cs?searchtype=author&query=Catley-Chandar%2C+S">Sibi Catley-Chandar</a>, <a href="/search/cs?searchtype=author&query=Tanay%2C+T">Thomas Tanay</a>, <a href="/search/cs?searchtype=author&query=Shaw%2C+R">Richard Shaw</a>, <a href="/search/cs?searchtype=author&query=P%C3%A9rez-Pellitero%2C+E">Eduardo Pérez-Pellitero</a>, <a href="/search/cs?searchtype=author&query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xing Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pan Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yali Guo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yongxin Wu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Youcheng Cai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yanan Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junting Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yanghong Zhou</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+P+Y">P. Y. Mok</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zongqi He</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhe Xiao</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+K">Kin-Chung Chan</a>, <a href="/search/cs?searchtype=author&query=Goshu%2C+H+L">Hana Lebeta Goshu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cuixin Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+R">Rongkang Dong</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kin-Man Lam</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+J">Jiayao Hao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qiong Gao</a>, et al. (5 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.15045v1-abstract-full"> This paper reviews the challenge on Sparse Neural Rendering that was part of the Advances in Image Manipulation (AIM) workshop, held in conjunction with ECCV 2024. This manuscript focuses on the competition set-up, the proposed methods and their respective results. The challenge aims at producing novel camera view synthesis of diverse scenes from sparse image observations. It is composed of two tracks with differing levels of sparsity: 3 views in Track 1 (very sparse) and 9 views in Track 2 (sparse). Participants are asked to optimise objective fidelity to the ground-truth images as measured via the Peak Signal-to-Noise Ratio (PSNR) metric. For both tracks, we use the newly introduced Sparse Rendering (SpaRe) dataset and the popular DTU MVS dataset. In this challenge, 5 teams submitted final results to Track 1 and 4 teams submitted final results to Track 2. The submitted models are varied and push the boundaries of the current state-of-the-art in sparse neural rendering. A detailed description of all models developed in the challenge is provided in this paper. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of Advances in Image Manipulation workshop at ECCV 2024</span> </p> </li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14327v2-abstract-full').style.display = 'none'; document.getElementById('2409.14327v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05477">arXiv:2409.05477</a> <span> [<a href="https://arxiv.org/pdf/2409.05477">pdf</a>, <a href="https://arxiv.org/format/2409.05477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Retrofitting Temporal Graph Neural Networks with Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qiang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+S+X">Susie Xi Rao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+F">Fangcheng Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiawei Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05477v3-abstract-short" style="display: inline;"> Temporal graph neural networks (TGNNs) outperform regular GNNs by incorporating time information into graph-based operations. However, TGNNs adopt specialized models (e.g., TGN, TGAT, and APAN ) and require tailored training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, which uses Transformer decoder as the backbone model for TGNN to enjoy Transformer's codebase for efficient t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05477v3-abstract-full').style.display = 'inline'; document.getElementById('2409.05477v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05477v3-abstract-full" style="display: none;"> Temporal graph neural networks (TGNNs) outperform regular GNNs by incorporating time information into graph-based operations. However, TGNNs adopt specialized models (e.g., TGN, TGAT, and APAN ) and require tailored training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, which uses Transformer decoder as the backbone model for TGNN to enjoy Transformer's codebase for efficient training. In particular, Transformer achieves tremendous success for language modeling, and thus the community developed high-performance kernels (e.g., flash-attention and memory-efficient attention) and efficient distributed training schemes (e.g., PyTorch FSDP, DeepSpeed, and Megatron-LM). 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05477">arXiv:2409.05477</a> <span> [<a href="https://arxiv.org/pdf/2409.05477">pdf</a>, <a href="https://arxiv.org/format/2409.05477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Retrofitting Temporal Graph Neural Networks with Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qiang Huang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+S+X">Susie Xi Rao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhichao Han</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+F">Fangcheng Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiawei Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.05477v3-abstract-full"> Temporal graph neural networks (TGNNs) outperform regular GNNs by incorporating time information into graph-based operations. However, TGNNs adopt specialized models (e.g., TGN, TGAT, and APAN) and require tailored training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, which uses the Transformer decoder as the backbone model for TGNN to enjoy the Transformer's codebase for efficient training. In particular, the Transformer achieves tremendous success in language modeling, and thus the community has developed high-performance kernels (e.g., flash-attention and memory-efficient attention) and efficient distributed training schemes (e.g., PyTorch FSDP, DeepSpeed, and Megatron-LM). We observe that TGNN resembles language modeling, i.e., the message aggregation operation between chronologically occurring nodes and their temporal neighbors in TGNNs can be structured as sequence modeling. Besides this similarity, we also incorporate a series of algorithm designs, including suffix infilling, temporal graph attention with self-loop, and causal masking self-attention, to make TF-TGN work. During training, existing systems are slow in transforming the graph topology and conducting graph sampling. As such, we propose methods to parallelize the CSR format conversion and graph sampling. We also adapt the Transformer codebase to train TF-TGN efficiently with multiple GPUs. We experiment with 9 graphs and compare with 2 state-of-the-art TGNN training frameworks. The results show that TF-TGN can accelerate training by over 2.20x while providing comparable or even superior accuracy to existing SOTA TGNNs. TF-TGN is available at https://github.com/qianghuangwhu/TF-TGN. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li>
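<p class="is-size-7">The reframing of temporal message aggregation as causal sequence modeling can be illustrated on toy data: a node's chronologically ordered neighbour interactions are treated as a token sequence, and a causal mask keeps each position from attending to later interactions. A sketch only; TF-TGN's actual kernels and designs (suffix infilling, self-loops) are not modeled here:</p> <pre><code>import numpy as np

def causal_attention(x):
    """x: (seq_len, dim) embeddings of time-ordered neighbour events."""
    scores = x @ x.T / np.sqrt(x.shape[1])
    mask = np.triu(np.ones_like(scores), k=1).astype(bool)
    scores[mask] = -1e9                    # hide future interactions
    weights = np.exp(scores - scores.max(axis=1, keepdims=True))
    weights = weights / weights.sum(axis=1, keepdims=True)
    return weights @ x                     # aggregated messages per step

events = np.random.default_rng(0).standard_normal((5, 8))
print(causal_attention(events).shape)  # (5, 8)
</code></pre>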
href="/search/cs?searchtype=author&query=Jiang%2C+J">Jianxiao Jiang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+L">Lijun Deng</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yisi Zhan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaojun Xiao</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xusheng Dai</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xuan Yan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+N">Nianyi Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Nan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+R">Ruixin Ni</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yang Dang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03512v1-abstract-short" style="display: inline;"> Since the first instances of online education, where courses were uploaded to accessible and shared online platforms, this form of scaling the dissemination of human knowledge to reach a broader audience has sparked extensive discussion and widespread adoption. Recognizing that personalized learning still holds significant potential for improvement, new AI technologies have been continuously integ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03512v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03512v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03512v1-abstract-full" style="display: none;"> Since the first instances of online education, where courses were uploaded to accessible and shared online platforms, this form of scaling the dissemination of human knowledge to reach a broader audience has sparked extensive discussion and widespread adoption. Recognizing that personalized learning still holds significant potential for improvement, new AI technologies have been continuously integrated into this learning format, resulting in a variety of educational AI applications such as educational recommendation and intelligent tutoring. The emergence of intelligence in large language models (LLMs) has allowed for these educational enhancements to be built upon a unified foundational model, enabling deeper integration. In this context, we propose MAIC (Massive AI-empowered Course), a new form of online education that leverages LLM-driven multi-agent systems to construct an AI-augmented classroom, balancing scalability with adaptivity. Beyond exploring the conceptual framework and technical innovations, we conduct preliminary experiments at Tsinghua University, one of China's leading universities. Drawing from over 100,000 learning records of more than 500 students, we obtain a series of valuable observations and initial analyses. This project will continue to evolve, ultimately aiming to establish a comprehensive open platform that supports and unifies research, technology, and applications in exploring the possibilities of online education in the era of large model AI. We envision this platform as a collaborative hub, bringing together educators, researchers, and innovators to collectively explore the future of AI-driven online education. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03512v1-abstract-full').style.display = 'none'; document.getElementById('2409.03512v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00727">arXiv:2409.00727</a> <span> [<a href="https://arxiv.org/pdf/2409.00727">pdf</a>, <a href="https://arxiv.org/format/2409.00727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Hound: Hunting Supervision Signals for Few and Zero Shot Node Classification on Text-attributed Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiao Yan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shiyu Jin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Quanqing Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuanhui Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuanyuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+C">Chuang Hu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiawei Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00727v1-abstract-short" style="display: inline;"> Text-attributed graph (TAG) is an important type of graph structured data with text descriptions for each node. Few- and zero-shot node classification on TAGs have many applications in fields such as academia and social networks. However, the two tasks are challenging due to the lack of supervision signals, and existing methods only use the contrastive loss to align graph-based node embedding and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00727v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00727v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00727v1-abstract-full" style="display: none;"> Text-attributed graph (TAG) is an important type of graph structured data with text descriptions for each node. Few- and zero-shot node classification on TAGs have many applications in fields such as academia and social networks. However, the two tasks are challenging due to the lack of supervision signals, and existing methods only use the contrastive loss to align graph-based node embedding and language-based text embedding. In this paper, we propose Hound to improve accuracy by introducing more supervision signals, and the core idea is to go beyond the node-text pairs that come with data. 
Specifically, we design three augmentation techniques, i.e., node perturbation, text matching, and semantics negation to provide more reference nodes for each text and vice versa. Node perturbation adds/drops edges to produce diversified node embeddings that can be matched with a text. Text matching retrieves texts with similar embeddings to match with a node. Semantics negation uses a negative prompt to construct a negative text with the opposite semantics, which is contrasted with the original node and text. We evaluate Hound on 5 datasets and compare with 13 state-of-the-art baselines. The results show that Hound consistently outperforms all baselines, and its accuracy improvements over the best-performing baseline are usually over 5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00727v1-abstract-full').style.display = 'none'; document.getElementById('2409.00727v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00618">arXiv:2409.00618</a> <span> [<a href="https://arxiv.org/pdf/2409.00618">pdf</a>, <a href="https://arxiv.org/format/2409.00618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> YOLOO: You Only Learn from Others Once </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+L">Lipeng Gu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+M">Mingqiang Wei</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xuefeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dingkun Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Haoran Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong-Jin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00618v1-abstract-short" style="display: inline;"> Multi-modal 3D multi-object tracking (MOT) typically necessitates extensive computational costs of deep neural networks (DNNs) to extract multi-modal representations. In this paper, we propose an intriguing question: May we learn from multiple modalities only during training to avoid multi-modal input in the inference phase? To answer it, we propose \textbf{YOLOO}, a novel multi-modal 3D MOT parad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00618v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00618v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00618v1-abstract-full" style="display: none;"> Multi-modal 3D multi-object tracking (MOT) typically necessitates extensive computational costs of deep neural networks (DNNs) to extract multi-modal representations. 
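
As a concrete illustration of the three supervision signals, the sketch below folds retrieved-text positives (text matching) and a semantics-negated hard negative into an InfoNCE-style objective, with the node embeddings assumed to come from a perturbed graph. This is our reading of the abstract, not the authors' code; all names and the exact masking scheme are assumptions.

```python
import torch
import torch.nn.functional as F

def hound_style_loss(node_emb, text_emb, neg_text_emb, tau=0.1, k=2):
    # node_emb:     (N, d) node embeddings from a (perturbed) graph encoder
    # text_emb:     (N, d) embeddings of each node's own text
    # neg_text_emb: (N, d) embeddings of semantics-negated texts (hard negatives)
    node_emb = F.normalize(node_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    neg_text_emb = F.normalize(neg_text_emb, dim=-1)

    sim = node_emb @ text_emb.t() / tau                           # (N, N) node-to-text similarities
    neg = (node_emb * neg_text_emb).sum(-1, keepdim=True) / tau   # negated text per node

    # Text matching: treat the k most similar texts (plus the paired one)
    # as additional positives for each node.
    pos = torch.zeros_like(sim, dtype=torch.bool)
    pos.scatter_(1, sim.topk(k + 1, dim=1).indices, True)
    pos |= torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)

    logits = torch.cat([sim, neg], dim=1)                         # append the hard-negative column
    log_p = logits - logits.logsumexp(dim=1, keepdim=True)
    return -log_p[:, : sim.size(1)][pos].mean()
```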

arXiv:2409.00618 (https://arxiv.org/abs/2409.00618) [pdf, other] - cs.CV
YOLOO: You Only Learn from Others Once
Authors: Lipeng Gu, Mingqiang Wei, Xuefeng Yan, Dingkun Zhu, Wei Zhao, Haoran Xie, Yong-Jin Liu
Abstract: Multi-modal 3D multi-object tracking (MOT) typically requires the extensive computational cost of deep neural networks (DNNs) to extract multi-modal representations. In this paper, we pose an intriguing question: may we learn from multiple modalities only during training to avoid multi-modal input in the inference phase? To answer it, we propose YOLOO, a novel multi-modal 3D MOT paradigm: You Only Learn from Others Once. YOLOO empowers the point cloud encoder to learn a unified tri-modal representation (UTR) from point clouds and other modalities, such as images and textual cues, all at once. Leveraging this UTR, YOLOO achieves efficient tracking solely using the point cloud encoder without compromising performance, fundamentally obviating the need for computationally intensive DNNs at inference. Specifically, YOLOO includes two core components: a unified tri-modal encoder (UTEnc) and a flexible geometric constraint (F-GC) module. UTEnc integrates a point cloud encoder with image and text encoders adapted from pre-trained CLIP. It seamlessly fuses point cloud information with rich visual-textual knowledge from CLIP into the point cloud encoder, yielding highly discriminative UTRs that facilitate the association between trajectories and detections. Additionally, F-GC filters out mismatched associations with similar representations but significant positional discrepancies. It further enhances the robustness of UTRs without requiring any scene-specific tuning, addressing a key limitation of customized geometric constraints (e.g., 3D IoU). Lastly, high-quality 3D trajectories are generated by a traditional data association component. By integrating these advancements into a multi-modal 3D MOT scheme, our YOLOO achieves substantial gains in both robustness and efficiency.
Submitted 1 September, 2024; originally announced September 2024.
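
The training recipe reads as cross-modal distillation: pull each object's point-cloud embedding toward frozen CLIP image and text embeddings so that, at inference, the point-cloud encoder alone suffices. A minimal sketch under that reading (the cosine-alignment loss and all names are our assumptions, not the paper's exact objective):

```python
import torch.nn.functional as F

def utr_alignment_loss(pc_emb, img_emb, txt_emb):
    """Pull the point-cloud embedding toward frozen CLIP image and text
    embeddings of the same object; at inference only the point-cloud
    encoder is needed."""
    pc = F.normalize(pc_emb, dim=-1)
    img = F.normalize(img_emb, dim=-1)   # frozen CLIP image encoder output
    txt = F.normalize(txt_emb, dim=-1)   # frozen CLIP text encoder output
    return (1 - (pc * img).sum(-1)).mean() + (1 - (pc * txt).sum(-1)).mean()
```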

arXiv:2408.17009 (https://arxiv.org/abs/2408.17009) [pdf, other] - cs.SD, eess.AS
Utilizing Speaker Profiles for Impersonation Audio Detection
Authors: Hao Gu, JiangYan Yi, Chenglong Wang, Yong Ren, Jianhua Tao, Xinrui Yan, Yujie Chen, Xiaohui Zhang
Abstract: Fake audio detection is an emerging and active topic. A growing body of literature aims to detect fake utterances, which are mostly generated by text-to-speech (TTS) or voice conversion (VC). However, countermeasures against impersonation remain an underexplored area. Impersonation is a type of fake audio in which an imitator replicates the specific traits and speech style of a target speaker. Unlike TTS and VC, which often leave digital traces or signal artifacts, impersonation involves live human beings producing entirely natural speech, rendering the detection of impersonation audio a challenging task. Thus, we propose a novel method that integrates speaker profiles into the process of impersonation audio detection. Speaker profiles are inherent characteristics that are challenging for impersonators to mimic accurately, such as a speaker's age and occupation. We aim to leverage these features to extract discriminative information for detecting impersonation audio. Moreover, there are no large impersonated speech corpora available for quantitative study of impersonation impacts. To address this gap, we design the first large-scale, diverse-speaker Chinese impersonation dataset, named ImPersonation Audio Detection (IPAD), to advance the community's research on impersonation audio detection. We evaluate several existing fake audio detection methods on the proposed IPAD dataset, demonstrating its necessity and its challenges. Additionally, our findings reveal that incorporating speaker profiles can significantly enhance the model's performance in detecting impersonation audio.
Submitted 30 August, 2024; originally announced August 2024.
Comments: Accepted by ACM MM2024
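
A natural way to "integrate speaker profiles into the process", as the abstract puts it, is to fuse a profile embedding with the utterance embedding before classification. The fusion-by-concatenation model below is an illustrative assumption, not the paper's architecture; dimensions and names are ours.

```python
import torch
import torch.nn as nn

class ProfileAwareDetector(nn.Module):
    """Illustrative detector fusing an utterance embedding with encoded
    speaker-profile attributes (e.g., age, occupation)."""
    def __init__(self, audio_dim=192, n_profile_feats=8, hidden=128):
        super().__init__()
        self.profile_proj = nn.Sequential(nn.Linear(n_profile_feats, hidden), nn.ReLU())
        self.head = nn.Sequential(
            nn.Linear(audio_dim + hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 2),  # genuine vs. impersonation
        )

    def forward(self, audio_emb, profile_feats):
        fused = torch.cat([audio_emb, self.profile_proj(profile_feats)], dim=-1)
        return self.head(fused)
```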

arXiv:2408.16094 (https://arxiv.org/abs/2408.16094) [pdf, ps, other] - cs.DC
Monadring: A lightweight consensus protocol to offer Validation-as-a-Service to AVS nodes
Authors: Yu Zhang, Xiao Yan, Gang Tang, Helena Wang
Abstract: Existing blockchain networks are often large-scale, requiring transactions to be synchronized across the entire network to reach consensus. On-chain computations can be prohibitively expensive, making many CPU-intensive computations infeasible. Inspired by the structure of IBM's token ring networks, we propose a lightweight consensus protocol called Monadring to address these issues. Monadring allows nodes within a large blockchain network to form smaller subnetworks, enabling faster and more cost-effective computations while maintaining the security guarantees of the main blockchain network. To further enhance Monadring's security, we introduce a node rotation mechanism based on a Verifiable Random Function (VRF) and blind voting using Fully Homomorphic Encryption (FHE) within the smaller subnetwork. Unlike the common voting-based election of validator nodes, Monadring leverages FHE to conceal voting information, eliminating the advantage of the last mover in the voting process. This paper details the design and implementation of the Monadring protocol and evaluates its performance and feasibility through simulation experiments. Our research contributes to enhancing the practical utility of blockchain technology in large-scale application scenarios.
Submitted 28 August, 2024; originally announced August 2024.
Comments: 23 pages, 3 figures
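
The node rotation idea can be pictured with a tiny committee-selection routine. The sketch below uses a hash as a stand-in for a real VRF; unlike an actual VRF (e.g., ECVRF), a bare hash yields no publicly verifiable proof, so this only illustrates how a per-epoch pseudorandom priority could drive rotation, not how Monadring implements it.

```python
import hashlib

def rotation_priority(node_secret: bytes, epoch: int) -> bytes:
    # Stand-in for VRF.evaluate(secret, epoch): a real VRF also returns a
    # proof that anyone can check against the node's public key.
    return hashlib.sha256(node_secret + epoch.to_bytes(8, "big")).digest()

def elect_committee(nodes: dict[str, bytes], epoch: int, size: int) -> list[str]:
    # Rank nodes by their epoch-specific pseudorandom output; the lowest
    # `size` outputs form this epoch's subnetwork committee.
    return sorted(nodes, key=lambda n: rotation_priority(nodes[n], epoch))[:size]

committee = elect_committee({"a": b"k1", "b": b"k2", "c": b"k3"}, epoch=7, size=2)
```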

arXiv:2408.11323 (https://arxiv.org/abs/2408.11323) [pdf, other] - cs.CV
Optimizing Transmit Field Inhomogeneity of Parallel RF Transmit Design in 7T MRI using Deep Learning
Authors: Zhengyi Lu, Hao Liang, Xiao Wang, Xinqiang Yan, Yuankai Huo
Abstract: Ultrahigh field (UHF) Magnetic Resonance Imaging (MRI) provides a higher signal-to-noise ratio and, thereby, higher spatial resolution. However, UHF MRI introduces challenges such as transmit radiofrequency (RF) field (B1+) inhomogeneities, which lead to uneven flip angles and image intensity anomalies. These issues can significantly degrade imaging quality and limit its medical applications. This study addresses B1+ field homogeneity through a novel deep learning-based strategy. Traditional methods like Magnitude Least Squares (MLS) optimization have been effective but are time-consuming and depend on the patient's presence. Recent machine learning approaches, such as RF Shim Prediction by Iteratively Projected Ridge Regression and other deep learning frameworks, have shown promise but face limitations like extensive training times and oversimplified architectures. We propose a two-step deep learning strategy. First, we obtain the desired reference RF shimming weights from multi-channel B1+ fields using random-initialized Adaptive Moment Estimation. Then, we employ Residual Networks (ResNets) to train a model that maps B1+ fields to target RF shimming outputs. Our approach does not rely on pre-calculated reference optimizations at test time and efficiently learns residual functions. Comparative studies with traditional MLS optimization demonstrate our method's advantages in terms of speed and accuracy. The proposed strategy achieves a faster and more efficient RF shimming design, significantly improving imaging quality at UHF. This advancement holds potential for broader applications in medical imaging and diagnostics.
Submitted 5 February, 2025; v1 submitted 21 August, 2024; originally announced August 2024.
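
Step one of the described pipeline, finding reference shim weights by random-initialized Adam, can be sketched directly: treat the complex per-channel weights as the optimization variable and minimize a field-inhomogeneity measure. We use the coefficient of variation as that measure, which is an assumption; the paper's exact cost may differ.

```python
import torch

def reference_shim_weights(b1_maps: torch.Tensor, steps: int = 500, lr: float = 0.05):
    """b1_maps: (C, H, W) complex B1+ map per transmit channel.
    Returns complex weights w flattening |sum_c w_c * B1_c| over the image."""
    w = torch.randn(b1_maps.shape[0], dtype=torch.cfloat, requires_grad=True)
    opt = torch.optim.Adam([w], lr=lr)
    for _ in range(steps):
        field = torch.einsum("c,chw->hw", w, b1_maps).abs()
        loss = field.std() / field.mean()   # coefficient of variation (assumed cost)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return w.detach()
```

Pairs of (B1+ maps, recovered weights) produced this way would then serve as supervision for the ResNet regressor described as the second step.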

arXiv:2408.09762 (https://arxiv.org/abs/2408.09762) [pdf, other] - cs.LG
Sequential Federated Learning in Hierarchical Architecture on Non-IID Datasets
Authors: Xingrun Yan, Shiyuan Zuo, Rongfei Fan, Han Hu, Li Shen, Puning Zhao, Yong Luo
Abstract: In a real federated learning (FL) system, communication overhead for passing model parameters between the clients and the parameter server (PS) is often a bottleneck. Hierarchical federated learning (HFL), which places multiple edge servers (ESs) between the clients and the PS, can partially alleviate this communication pressure but still requires aggregating model parameters from multiple ESs at the PS. To further reduce communication overhead, we bring sequential FL (SFL) into HFL for the first time: the central PS is removed, and model training is completed solely by passing the global model between two adjacent ESs in each iteration. We propose a novel algorithm adapted to this combined framework, referred to as Fed-CHS. Convergence results are derived for strongly convex and non-convex loss functions under various data heterogeneity setups, and they show convergence performance comparable to algorithms for HFL or SFL alone. Experimental results provide evidence of the superiority of the proposed Fed-CHS in both communication overhead saving and test accuracy over baseline methods.
Submitted 19 August, 2024; originally announced August 2024.

arXiv:2408.09539 (https://arxiv.org/abs/2408.09539) [pdf, other] - cs.LG, cs.DC
Byzantine-resilient Federated Learning Employing Normalized Gradients on Non-IID Datasets
Authors: Shiyuan Zuo, Xingrun Yan, Rongfei Fan, Li Shen, Puning Zhao, Jie Xu, Han Hu
Abstract: In practical federated learning (FL) systems, the presence of malicious Byzantine attacks and data heterogeneity often introduces biases into the learning process. However, existing Byzantine-robust methods typically achieve only a compromise between adaptability to different loss function types (including both strongly convex and non-convex) and robustness to heterogeneous datasets, and with a non-zero optimality gap. Moreover, this compromise often comes at the cost of high computational complexity for aggregation, which significantly slows down training. To address this challenge, we propose a federated learning approach called the Federated Normalized Gradients Algorithm (Fed-NGA). Fed-NGA simply normalizes the uploaded local gradients to unit vectors before aggregation, achieving a time complexity of $\mathcal{O}(pM)$, where $p$ is the dimension of the model parameters and $M$ is the number of participating clients. This complexity is the best among all existing Byzantine-robust methods. Furthermore, through rigorous proof, we demonstrate that Fed-NGA transcends the trade-off between adaptability to loss function type and data heterogeneity, and overcomes the limitation of a non-zero optimality gap in the existing literature. Specifically, Fed-NGA can adapt to both non-convex loss functions and non-IID datasets simultaneously, with a zero optimality gap at a rate of $\mathcal{O}(1/T^{\frac{1}{2}-\delta})$, where $T$ is the iteration number and $\delta \in (0, \frac{1}{2})$. When the loss function is strongly convex, the rate of achieving a zero optimality gap improves to linear. Experimental results provide evidence of the superiority of the proposed Fed-NGA in time complexity and convergence performance over baseline methods.
Submitted 18 August, 2024; originally announced August 2024.
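
The aggregation rule itself is stated plainly in the abstract and is nearly a one-liner: normalize each client gradient to a unit vector, then average. A direct rendering (the small constant `eps` is our addition to avoid division by zero):

```python
import numpy as np

def fed_nga_aggregate(client_grads: list[np.ndarray], eps: float = 1e-12) -> np.ndarray:
    """Normalize each uploaded gradient to a unit vector, then average.
    Cost is O(pM) for M clients and p parameters; a Byzantine client
    cannot dominate the mean by inflating its gradient's magnitude."""
    return np.mean([g / (np.linalg.norm(g) + eps) for g in client_grads], axis=0)
```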

arXiv:2408.08191 (https://arxiv.org/abs/2408.08191) [pdf, other] - cs.CV
Beyond Full Labels: Energy-Double-Guided Single-Point Prompt for Infrared Small Target Label Generation
Authors: Shuai Yuan, Hanlin Qin, Renke Kou, Xiang Yan, Zechuan Li, Chenxu Peng, Huixin Zhou
Abstract: We pioneer a learning-based single-point prompt paradigm for infrared small target label generation (IRSTLG) to relieve annotation burdens. Unlike previous clustering-based methods, our intuition is that point-guided mask generation requires just one more prompt than target detection, i.e., IRSTLG can be treated as infrared small target detection (IRSTD) with a location hint. Therefore, we propose an elegant yet effective Energy-Double-Guided Single-point Prompt (EDGSP) framework, aiming to adeptly transform a coarse IRSTD network into a refined label generation method. Specifically, EDGSP comprises three key modules: 1) target energy initialization (TEI), which establishes a foundational outline to streamline the mapping process for effective shape evolution; 2) double prompt embedding (DPE), for rapidly localizing regions of interest and reinforcing high-resolution individual edges to avoid label adhesion; and 3) bounding box-based matching (BBM), which eliminates false masks by considering comprehensive cluster boundary conditions to obtain a reliable output. In this way, pseudo labels generated by three backbones equipped with EDGSP achieve 100% object-level probability of detection (Pd) and a 0% false-alarm rate (Fa) on the SIRST, NUDT-SIRST, and IRSTD-1k datasets, with a pixel-level intersection over union (IoU) improvement of 13.28% over state-of-the-art (SOTA) label generation methods. Further applying the inferred masks to train detection models, EDGSP for the first time enables a single-point-generated pseudo mask to surpass manual labels. Even with coarse single-point annotations, it still achieves 99.5% of the performance of full labeling. Code is available at https://github.com/xdFai/EDGSP.
Submitted 15 November, 2024; v1 submitted 15 August, 2024; originally announced August 2024.
Comments: Updated the title to better reflect the content of the paper
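
Of the three modules, the bounding box-based matching step is the easiest to picture: predicted mask components that no annotated point falls into are discarded as false masks. A simplified rendering with scipy (the paper's matching conditions are richer than this plain containment test):

```python
import numpy as np
from scipy import ndimage

def bbm_filter(pred_mask: np.ndarray, points: list[tuple[int, int]]) -> np.ndarray:
    """Keep only connected components of the predicted mask whose bounding
    box contains at least one single-point annotation (row, col); drop the
    rest as false masks."""
    labels, _ = ndimage.label(pred_mask)
    keep = np.zeros_like(pred_mask, dtype=bool)
    for comp, (ys, xs) in enumerate(ndimage.find_objects(labels), start=1):
        if any(ys.start <= r < ys.stop and xs.start <= c < xs.stop for r, c in points):
            keep |= labels == comp
    return keep
```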
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated the title to better reflect the content of the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04967">arXiv:2408.04967</a> <span> [<a href="https://arxiv.org/pdf/2408.04967">pdf</a>, <a href="https://arxiv.org/format/2408.04967">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ADD 2023: Towards Audio Deepfake Detection and Analysis in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C+Y">Chu Yuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yong Ren</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Hao Gu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junzuo Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04967v3-abstract-short" style="display: inline;"> The growing prominence of the field of audio deepfake detection is driven by its wide range of applications, notably in protecting the public from potential fraud and other malicious activities, prompting the need for greater attention and research in this area. The ADD 2023 challenge goes beyond binary real/fake classification by emulating real-world scenarios, such as the identification of manip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04967v3-abstract-full').style.display = 'inline'; document.getElementById('2408.04967v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04967v3-abstract-full" style="display: none;"> The growing prominence of the field of audio deepfake detection is driven by its wide range of applications, notably in protecting the public from potential fraud and other malicious activities, prompting the need for greater attention and research in this area. The ADD 2023 challenge goes beyond binary real/fake classification by emulating real-world scenarios, such as the identification of manipulated intervals in partially fake audio and determining the source responsible for generating any fake audio, both with real-life implications, notably in audio forensics, law enforcement, and construction of reliable and trustworthy evidence. To further foster research in this area, in this article, we describe the dataset that was used in the fake game, manipulation region location and deepfake algorithm recognition tracks of the challenge. We also focus on the analysis of the technical methodologies by the top-performing participants in each task and note the commonalities and differences in their approaches. 
Finally, we discuss the current technical limitations as identified through the technical analysis, and provide a roadmap for future research directions. The dataset is available for download at http://addchallenge.cn/downloadADD2023. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04967v3-abstract-full').style.display = 'none'; document.getElementById('2408.04967v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03178">arXiv:2408.03178</a> <span> [<a href="https://arxiv.org/pdf/2408.03178">pdf</a>, <a href="https://arxiv.org/format/2408.03178">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Object is Worth 64x64 Pixels: Generating 3D Object via Image Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xingguang Yan</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Han-Hung Lee</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+A+X">Angel X. Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03178v1-abstract-short" style="display: inline;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed "Object Images." This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03178v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03178v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03178v1-abstract-full" style="display: none;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed "Object Images." This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in polygonal meshes. 

arXiv:2408.03178 (https://arxiv.org/abs/2408.03178) [pdf, other] - cs.CV, cs.GR, cs.LG
An Object is Worth 64x64 Pixels: Generating 3D Object via Image Diffusion
Authors: Xingguang Yan, Han-Hung Lee, Ziyu Wan, Angel X. Chang
Abstract: We introduce a new approach for generating realistic 3D models with UV maps through a representation termed "Object Images." This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in polygonal meshes. This method allows us to use image generation models, such as Diffusion Transformers, directly for 3D shape generation. Evaluated on the ABO dataset, our generated shapes with patch structures achieve point cloud FID comparable to recent 3D generative models, while naturally supporting PBR material generation.
Submitted 6 August, 2024; originally announced August 2024.
Comments: Project Page: https://omages.github.io/
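
The appeal of the representation is that a fixed 64x64 grid makes 3D shapes as easy to handle as images. The toy decoder below assumes xyz coordinates in the first three channels and an occupancy flag in the fourth; the paper's actual channel layout (geometry, appearance, patch structure) is richer, so treat this purely as an illustration of the idea.

```python
import numpy as np

def object_image_to_points(obj_img: np.ndarray, occupied_thresh: float = 0.0) -> np.ndarray:
    """Toy decoder for a 64x64 'object image': channels 0..2 are assumed
    to hold xyz surface positions and channel 3 an occupancy flag.
    Returns a (K, 3) array of surface samples."""
    assert obj_img.shape[:2] == (64, 64)
    xyz = obj_img[..., :3].reshape(-1, 3)
    occ = obj_img[..., 3].reshape(-1)
    return xyz[occ > occupied_thresh]
```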

arXiv:2408.01691 (https://arxiv.org/abs/2408.01691) [pdf, other] - cs.LG, cs.AI
TreeCSS: An Efficient Framework for Vertical Federated Learning
Authors: Qinbo Zhang, Xiao Yan, Yukai Ding, Quanqing Xu, Chuang Hu, Xiaokai Zhou, Jiawei Jiang
Abstract: Vertical federated learning (VFL) considers the case in which the features of the data samples are partitioned over different participants. VFL consists of two main steps: identifying the common data samples of all participants (alignment) and training a model using the aligned data samples (training). However, when there are many participants and data samples, both alignment and training become slow. As such, we propose TreeCSS, an efficient VFL framework that accelerates both main steps. In particular, for sample alignment, we design an efficient multi-party private set intersection (MPSI) protocol called Tree-MPSI, which adopts a tree-based structure and a data-volume-aware scheduling strategy to parallelize alignment among the participants. As model training time scales with the number of data samples, we conduct coreset selection (CSS) to choose representative data samples for training. Our CSS method adopts a clustering-based scheme for security and generality: it first clusters the features locally on each participant and then merges the local clustering results to select representative samples. In addition, we weight the samples according to their distances to the centroids to reflect their importance to model training. We evaluate the effectiveness and efficiency of the TreeCSS framework on various datasets and models. The results show that, compared with vanilla VFL, TreeCSS accelerates training by up to 2.93x and achieves comparable model accuracy.
Submitted 3 August, 2024; originally announced August 2024.
Comments: 16 pages, 7 figures
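
The clustering-based coreset step is easy to picture on a single participant: cluster the local features, keep the sample nearest each centroid as the representative, and weight it by how many points the cluster covers and how tightly. The weighting formula below is our stand-in; TreeCSS defines its own distance-based weights.

```python
import numpy as np
from sklearn.cluster import KMeans

def coreset_select(features: np.ndarray, k: int):
    """Cluster local features, return indices of one representative per
    cluster plus an importance weight for each representative."""
    km = KMeans(n_clusters=k, n_init=10).fit(features)
    reps, weights = [], []
    for c in range(k):
        idx = np.where(km.labels_ == c)[0]
        d = np.linalg.norm(features[idx] - km.cluster_centers_[c], axis=1)
        reps.append(idx[d.argmin()])                 # sample nearest the centroid
        weights.append(len(idx) / (1.0 + d.mean()))  # bigger, tighter clusters weigh more
    return np.array(reps), np.array(weights)
```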
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Canyu Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Baixiang Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zekun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaorun Chen</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+S">Shiyang Lai</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiongxiao Xu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jia-Chen Gu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jindong Gu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">Huaxiu Yao</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaowei Xiao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xifeng Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dawn Song</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20224v3-abstract-short" style="display: inline;"> Knowledge editing has been increasingly adopted to correct the false or outdated knowledge in Large Language Models (LLMs). Meanwhile, one critical but under-explored question is: can knowledge editing be used to inject harm into LLMs? In this paper, we propose to reformulate knowledge editing as a new type of safety threat for LLMs, namely Editing Attack, and conduct a systematic investigation wi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20224v3-abstract-full').style.display = 'inline'; document.getElementById('2407.20224v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20224v3-abstract-full" style="display: none;"> Knowledge editing has been increasingly adopted to correct the false or outdated knowledge in Large Language Models (LLMs). Meanwhile, one critical but under-explored question is: can knowledge editing be used to inject harm into LLMs? In this paper, we propose to reformulate knowledge editing as a new type of safety threat for LLMs, namely Editing Attack, and conduct a systematic investigation with a newly constructed dataset EditAttack. Specifically, we focus on two typical safety risks of Editing Attack including Misinformation Injection and Bias Injection. For the risk of misinformation injection, we first categorize it into commonsense misinformation injection and long-tail misinformation injection. Then, we find that editing attacks can inject both types of misinformation into LLMs, and the effectiveness is particularly high for commonsense misinformation injection. For the risk of bias injection, we discover that not only can biased sentences be injected into LLMs with high effectiveness, but also one single biased sentence injection can cause a bias increase in general outputs of LLMs, which are even highly irrelevant to the injected sentence, indicating a catastrophic impact on the overall fairness of LLMs. 
Then, we further illustrate the high stealthiness of editing attacks, measured by their impact on the general knowledge and reasoning capacities of LLMs, and show the hardness of defending editing attacks with empirical evidence. Our discoveries demonstrate the emerging misuse risks of knowledge editing techniques on compromising the safety alignment of LLMs and the feasibility of disseminating misinformation or bias with LLMs as new channels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20224v3-abstract-full').style.display = 'none'; document.getElementById('2407.20224v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contributed equally. 9 pages for main paper, 36 pages including appendix. The code, results, dataset for this paper and more resources are on the project website: https://llm-editing.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16943">arXiv:2407.16943</a> <span> [<a href="https://arxiv.org/pdf/2407.16943">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> McGAN: Generating Manufacturable Designs by Embedding Manufacturing Rules into Conditional Generative Adversarial Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhichao Wang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiaoliang Yan</a>, <a href="/search/cs?searchtype=author&query=Melkote%2C+S">Shreyes Melkote</a>, <a href="/search/cs?searchtype=author&query=Rosen%2C+D">David Rosen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16943v1-abstract-short" style="display: inline;"> Generative design (GD) methods aim to automatically generate a wide variety of designs that satisfy functional or aesthetic design requirements. However, research to date generally lacks considerations of manufacturability of the generated designs. To this end, we propose a novel GD approach by using deep neural networks to encode design for manufacturing (DFM) rules, thereby modifying part design… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16943v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16943v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16943v1-abstract-full" style="display: none;"> Generative design (GD) methods aim to automatically generate a wide variety of designs that satisfy functional or aesthetic design requirements. However, research to date generally lacks considerations of manufacturability of the generated designs. 
To this end, we propose a novel GD approach by using deep neural networks to encode design for manufacturing (DFM) rules, thereby modifying part designs to make them manufacturable by a given manufacturing process. Specifically, a three-step approach is proposed: first, an instance segmentation method, Mask R-CNN, is used to decompose a part design into subregions. Second, a conditional generative adversarial neural network (cGAN), Pix2Pix, transforms unmanufacturable decomposed subregions into manufacturable subregions. The transformed subregions of designs are subsequently reintegrated into a unified manufacturable design. These three steps, Mask-RCNN, Pix2Pix, and reintegration, form the basis of the proposed Manufacturable conditional GAN (McGAN) framework. Experimental results show that McGAN can transform existing unmanufacturable designs to generate their corresponding manufacturable counterparts automatically that realize the specified manufacturing rules in an efficient and robust manner. The effectiveness of McGAN is demonstrated through two-dimensional design case studies of an injection molding process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16943v1-abstract-full').style.display = 'none'; document.getElementById('2407.16943v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yan%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yan%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 