Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 5,302 results for author: <span class="mathjax">Li, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14295">arXiv:2411.14295</a> <span> [<a href="https://arxiv.org/pdf/2411.14295">pdf</a>, <a href="https://arxiv.org/format/2411.14295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> StereoCrafter-Zero: Zero-Shot Stereo Video Generation with Noisy Restart </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jian Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenyu Li</a>, <a href="/search/cs?searchtype=author&query=Wonka%2C+P">Peter Wonka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14295v1-abstract-short" style="display: inline;"> Generating high-quality stereo videos that mimic human binocular vision requires maintaining consistent depth perception and temporal coherence across frames. While diffusion models have advanced image and video synthesis, generating high-quality stereo videos remains challenging due to the difficulty of maintaining consistent temporal and spatial coherence between left and right views. 
We introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14295v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14295v1-abstract-full" style="display: none;"> Generating high-quality stereo videos that mimic human binocular vision requires maintaining consistent depth perception and temporal coherence across frames. While diffusion models have advanced image and video synthesis, generating high-quality stereo videos remains challenging due to the difficulty of maintaining consistent temporal and spatial coherence between left and right views. We introduce \textit{StereoCrafter-Zero}, a novel framework for zero-shot stereo video generation that leverages video diffusion priors without the need for paired training data. Key innovations include a noisy restart strategy to initialize stereo-aware latents and an iterative refinement process that progressively harmonizes the latent space, addressing issues like temporal flickering and view inconsistencies. Comprehensive evaluations, including quantitative metrics and user studies, demonstrate that \textit{StereoCrafter-Zero} produces high-quality stereo videos with improved depth consistency and temporal smoothness, even when depth estimations are imperfect. Our framework is robust and adaptable across various diffusion models, setting a new benchmark for zero-shot stereo video generation and enabling more immersive visual experiences. Our code can be found in~\url{https://github.com/shijianjian/StereoCrafter-Zero}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14295v1-abstract-full').style.display = 'none'; document.getElementById('2411.14295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13914">arXiv:2411.13914</a> <span> [<a href="https://arxiv.org/pdf/2411.13914">pdf</a>, <a href="https://arxiv.org/format/2411.13914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ICODE: Modeling Dynamical Systems with Extrinsic Input Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyi Li</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+W">Wenjie Mei</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Ke Yu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yang Bai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shihua Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13914v1-abstract-short" style="display: inline;"> Learning models of dynamical systems with external inputs, that may be, for example, nonsmooth or piecewise, is crucial for studying complex phenomena and predicting future state evolution, which is essential for applications such as safety guarantees and decision-making. In this work, we introduce \emph{Input Concomitant Neural ODEs (ICODEs)}, which incorporate precise real-time input information… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13914v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13914v1-abstract-full" style="display: none;"> Learning models of dynamical systems with external inputs, that may be, for example, nonsmooth or piecewise, is crucial for studying complex phenomena and predicting future state evolution, which is essential for applications such as safety guarantees and decision-making. In this work, we introduce \emph{Input Concomitant Neural ODEs (ICODEs)}, which incorporate precise real-time input information into the learning process of the models, rather than treating the inputs as hidden parameters to be learned. The sufficient conditions to ensure the model's contraction property are provided to guarantee that system trajectories of the trained model converge to a fixed point, regardless of initial conditions across different training processes. We validate our method through experiments on several representative real dynamics: Single-link robot, DC-to-DC converter, motion dynamics of a rigid body, Rabinovich-Fabrikant equation, Glycolytic-glycogenolytic pathway model, and heat conduction equation. The experimental results demonstrate that our proposed ICODEs efficiently learn the ground truth systems, achieving superior prediction performance under both typical and atypical inputs. This work offers a valuable class of neural ODE models for understanding physical systems with explicit external input information, with potential promising applications in fields such as physics and robotics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13914v1-abstract-full').style.display = 'none'; document.getElementById('2411.13914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13807">arXiv:2411.13807</a> <span> [<a href="https://arxiv.org/pdf/2411.13807">pdf</a>, <a href="https://arxiv.org/format/2411.13807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MagicDriveDiT: High-Resolution Long Video Generation for Autonomous Driving with Adaptive Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+R">Ruiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+B">Bo Xiao</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+L">Lanqing Hong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenguo Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13807v1-abstract-short" style="display: inline;"> The rapid advancement of diffusion models has greatly improved video synthesis, especially in controllable video generation, which is essential for applications like autonomous driving. However, existing methods are limited by scalability and how control conditions are integrated, failing to meet the needs for high-resolution and long videos for autonomous driving applications. In this paper, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13807v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13807v1-abstract-full" style="display: none;"> The rapid advancement of diffusion models has greatly improved video synthesis, especially in controllable video generation, which is essential for applications like autonomous driving. However, existing methods are limited by scalability and how control conditions are integrated, failing to meet the needs for high-resolution and long videos for autonomous driving applications. In this paper, we introduce MagicDriveDiT, a novel approach based on the DiT architecture, and tackle these challenges. Our method enhances scalability through flow matching and employs a progressive training strategy to manage complex scenarios. By incorporating spatial-temporal conditional encoding, MagicDriveDiT achieves precise control over spatial-temporal latents. Comprehensive experiments show its superior performance in generating realistic street scene videos with higher resolution and more frames. 
MagicDriveDiT significantly improves video generation quality and spatial-temporal controls, expanding its potential applications across various tasks in autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13807v1-abstract-full').style.display = 'none'; document.getElementById('2411.13807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://flymin.github.io/magicdrivedit/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13602">arXiv:2411.13602</a> <span> [<a href="https://arxiv.org/pdf/2411.13602">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zhengyao Ding</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yujian Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Youyao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziyu Li</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yiheng Mao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haitao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yue Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mengjia Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longbo Wang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xuesen Chu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+W">Weichao Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyi Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongkun Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Ting Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhengxing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13602v1-abstract-short" style="display: inline;"> Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. 
   Abstract: Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and the MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals), and the findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level.
   Submitted 19 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13553">arXiv:2411.13553</a> <span> [<a href="https://arxiv.org/pdf/2411.13553">pdf</a>, <a href="https://arxiv.org/format/2411.13553">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AI-generated Image Detection: Passive or Watermark? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+M">Moyang Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuepeng Hu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhengyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zeyu Li</a>, <a href="/search/cs?searchtype=author&query=Sadovnik%2C+A">Amir Sadovnik</a>, <a href="/search/cs?searchtype=author&query=Daw%2C+A">Arka Daw</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N">Neil Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13553v1-abstract-short" style="display: inline;"> While text-to-image models offer numerous benefits, they also pose significant societal risks. Detecting AI-generated images is crucial for mitigating these risks. Detection methods can be broadly categorized into passive and watermark-based approaches: passive detectors rely on artifacts present in AI-generated images, whereas watermark-based detectors proactively embed watermarks into such image… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13553v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13553v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13553v1-abstract-full" style="display: none;"> While text-to-image models offer numerous benefits, they also pose significant societal risks. Detecting AI-generated images is crucial for mitigating these risks. Detection methods can be broadly categorized into passive and watermark-based approaches: passive detectors rely on artifacts present in AI-generated images, whereas watermark-based detectors proactively embed watermarks into such images. A key question is which type of detector performs better in terms of effectiveness, robustness, and efficiency. However, the current literature lacks a comprehensive understanding of this issue. In this work, we aim to bridge that gap by developing ImageDetectBench, the first comprehensive benchmark to compare the effectiveness, robustness, and efficiency of passive and watermark-based detectors. Our benchmark includes four datasets, each containing a mix of AI-generated and non-AI-generated images. We evaluate five passive detectors and four watermark-based detectors against eight types of common perturbations and three types of adversarial perturbations. 
Our benchmark results reveal several interesting findings. For instance, watermark-based detectors consistently outperform passive detectors, both in the presence and absence of perturbations. Based on these insights, we provide recommendations for detecting AI-generated images, e.g., when both types of detectors are applicable, watermark-based detectors should be the preferred choice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13553v1-abstract-full').style.display = 'none'; document.getElementById('2411.13553v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13154">arXiv:2411.13154</a> <span> [<a href="https://arxiv.org/pdf/2411.13154">pdf</a>, <a href="https://arxiv.org/format/2411.13154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DMQR-RAG: Diverse Multi-Query Rewriting for RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhicong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhishu Jiang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+H">Hangyu Mao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongxia Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiazhen Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fuzheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13154v1-abstract-short" style="display: inline;"> Large language models often encounter challenges with static knowledge and hallucinations, which undermine their reliability. Retrieval-augmented generation (RAG) mitigates these issues by incorporating external information. However, user queries frequently contain noise and intent deviations, necessitating query rewriting to improve the relevance of retrieved documents. In this paper, we introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13154v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13154v1-abstract-full" style="display: none;"> Large language models often encounter challenges with static knowledge and hallucinations, which undermine their reliability. Retrieval-augmented generation (RAG) mitigates these issues by incorporating external information. 
However, user queries frequently contain noise and intent deviations, necessitating query rewriting to improve the relevance of retrieved documents. In this paper, we introduce DMQR-RAG, a Diverse Multi-Query Rewriting framework designed to improve the performance of both document retrieval and final responses in RAG. Specifically, we investigate how queries with varying information quantities can retrieve a diverse array of documents, presenting four rewriting strategies that operate at different levels of information to enhance the performance of baseline approaches. Additionally, we propose an adaptive strategy selection method that minimizes the number of rewrites while optimizing overall performance. Our methods have been rigorously validated through extensive experiments conducted in both academic and industry settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13154v1-abstract-full').style.display = 'none'; document.getElementById('2411.13154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12995">arXiv:2411.12995</a> <span> [<a href="https://arxiv.org/pdf/2411.12995">pdf</a>, <a href="https://arxiv.org/format/2411.12995">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Eliminating Ratio Bias for Gradient-based Simulated Parameter Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zehao Li</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yijie Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12995v1-abstract-short" style="display: inline;"> This article addresses the challenge of parameter calibration in stochastic models where the likelihood function is not analytically available. We propose a gradient-based simulated parameter estimation framework, leveraging a multi-time scale algorithm that tackles the issue of ratio bias in both maximum likelihood estimation and posterior density estimation problems. Additionally, we introduce a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12995v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12995v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12995v1-abstract-full" style="display: none;"> This article addresses the challenge of parameter calibration in stochastic models where the likelihood function is not analytically available. 
We propose a gradient-based simulated parameter estimation framework, leveraging a multi-time scale algorithm that tackles the issue of ratio bias in both maximum likelihood estimation and posterior density estimation problems. Additionally, we introduce a nested simulation optimization structure, providing theoretical analyses including strong convergence, asymptotic normality, convergence rate, and budget allocation strategies for the proposed algorithm. The framework is further extended to neural network training, offering a novel perspective on stochastic approximation in machine learning. Numerical experiments show that our algorithm can improve the estimation accuracy and save computational costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12995v1-abstract-full').style.display = 'none'; document.getElementById('2411.12995v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12773">arXiv:2411.12773</a> <span> [<a href="https://arxiv.org/pdf/2411.12773">pdf</a>, <a href="https://arxiv.org/format/2411.12773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Decoupling Training-Free Guided Diffusion by ADMM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zehua Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+J+J">James J. Clark</a>, <a href="/search/cs?searchtype=author&query=Si%2C+X">Xujie Si</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12773v1-abstract-short" style="display: inline;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12773v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12773v1-abstract-full" style="display: none;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. 
While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two components. Specifically, we introduce two variables ${x}$ and ${z}$, to represent the generated samples governed by the unconditional generation model and the guidance function, respectively. This decoupling reformulates conditional generation into two manageable subproblems, unified by the constraint ${x} = {z}$. Leveraging this setup, we develop a new algorithm based on the Alternating Direction Method of Multipliers (ADMM) to adaptively balance these components. Additionally, we establish the equivalence between the diffusion reverse step and the proximal operator of ADMM and provide a detailed convergence analysis of our algorithm under certain mild assumptions. Our experiments demonstrate that our proposed method ADMMDiff consistently generates high-quality samples while ensuring strong adherence to the conditioning criteria. It outperforms existing methods across a range of conditional generation tasks, including image generation with various guidance and controllable motion synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12773v1-abstract-full').style.display = 'none'; document.getElementById('2411.12773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12363">arXiv:2411.12363</a> <span> [<a href="https://arxiv.org/pdf/2411.12363">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DGSNA: prompt-based Dynamic Generative Scene-based Noise Addition method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zihao Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhentao Lin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Bi Zeng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Linyi Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jia Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12363v1-abstract-short" style="display: inline;"> This paper addresses the challenges of accurately enumerating and describing scenes and the labor-intensive process required to replicate acoustic environments using non-generative methods. 
   Abstract: This paper addresses the challenges of accurately enumerating and describing scenes and the labor-intensive process required to replicate acoustic environments using non-generative methods. We introduce the prompt-based Dynamic Generative Scene-based Noise Addition method (DGSNA), which innovatively combines the Dynamic Generation of Scene Information (DGSI) with Scene-based Noise Addition for Audio (SNAA). Employing generative chat models structured within the Background-Examples-Task (BET) prompt framework, the DGSI component facilitates the dynamic synthesis of tailored Scene Information (SI) for specific acoustic environments. Additionally, the SNAA component leverages Room Impulse Response (RIR) filters and Text-To-Audio (TTA) systems to generate realistic, scene-based noise that can be adapted for both indoor and outdoor environments. Through comprehensive experiments, the adaptability of DGSNA across different generative chat models was demonstrated. The results, assessed through both objective and subjective evaluations, show that DGSNA provides robust performance in dynamically generating precise SI and effectively enhancing scene-based noise addition capabilities, thus offering significant improvements over traditional methods in acoustic scene simulation. Our implementation and demos are available at https://dgsna.github.io.
   Submitted 19 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12184">arXiv:2411.12184</a> <span> [<a href="https://arxiv.org/pdf/2411.12184">pdf</a>, <a href="https://arxiv.org/ps/2411.12184">ps</a>, <a href="https://arxiv.org/format/2411.12184">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Testability of Instrumental Variables in Additive Nonlinear, Non-Constant Effects Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xichen Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Biwei Huang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yan Zeng</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Z">Zhi Geng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+F">Feng Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12184v1-abstract-short" style="display: inline;"> We address the issue of the testability of instrumental variables derived from observational data. Most existing testable implications are centered on scenarios where the treatment is a discrete variable, e.g., instrumental inequality (Pearl, 1995), or where the effect is assumed to be constant, e.g., instrumental variables condition based on the principle of independent mechanisms (Burauel, 2023)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12184v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12184v1-abstract-full" style="display: none;"> We address the issue of the testability of instrumental variables derived from observational data. Most existing testable implications are centered on scenarios where the treatment is a discrete variable, e.g., instrumental inequality (Pearl, 1995), or where the effect is assumed to be constant, e.g., instrumental variables condition based on the principle of independent mechanisms (Burauel, 2023). However, treatments can often be continuous variables, such as drug dosages or nutritional content levels, and non-constant effects may occur in many real-world scenarios. In this paper, we consider an additive nonlinear, non-constant effects model with unmeasured confounders, in which treatments can be either discrete or continuous, and propose an Auxiliary-based Independence Test (AIT) condition to test whether a variable is a valid instrument. We first show that if the candidate instrument is valid, then the AIT condition holds. Moreover, we illustrate the implications of the AIT condition and demonstrate that, in certain conditions, AIT conditions are necessary and sufficient to detect all invalid IVs. We also extend the AIT condition to include covariates and introduce a practical testing algorithm. 
Experimental results on both synthetic and three different real-world datasets show the effectiveness of our proposed condition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12184v1-abstract-full').style.display = 'none'; document.getElementById('2411.12184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11852">arXiv:2411.11852</a> <span> [<a href="https://arxiv.org/pdf/2411.11852">pdf</a>, <a href="https://arxiv.org/format/2411.11852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3658617.3697687">10.1145/3658617.3697687 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LUTMUL: Exceed Conventional FPGA Roofline Limit by LUT-based Efficient Multiplication for Neural Network Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yanyue Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengang Li</a>, <a href="/search/cs?searchtype=author&query=Diaconu%2C+D">Dana Diaconu</a>, <a href="/search/cs?searchtype=author&query=Handagala%2C+S">Suranga Handagala</a>, <a href="/search/cs?searchtype=author&query=Leeser%2C+M">Miriam Leeser</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xue Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11852v1-abstract-short" style="display: inline;"> For FPGA-based neural network accelerators, digital signal processing (DSP) blocks have traditionally been the cornerstone for handling multiplications. This paper introduces LUTMUL, which harnesses the potential of look-up tables (LUTs) for performing multiplications. The availability of LUTs typically outnumbers that of DSPs by a factor of 100, offering a significant computational advantage. By… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11852v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11852v1-abstract-full" style="display: none;"> For FPGA-based neural network accelerators, digital signal processing (DSP) blocks have traditionally been the cornerstone for handling multiplications. This paper introduces LUTMUL, which harnesses the potential of look-up tables (LUTs) for performing multiplications. 
The availability of LUTs typically outnumbers that of DSPs by a factor of 100, offering a significant computational advantage. By exploiting this advantage of LUTs, our method demonstrates a potential boost in the performance of FPGA-based neural network accelerators with a reconfigurable dataflow architecture. Our approach challenges the conventional peak performance on DSP-based accelerators and sets a new benchmark for efficient neural network inference on FPGAs. Experimental results demonstrate that our design achieves the best inference speed among all FPGA-based accelerators, achieving a throughput of 1627 images per second and maintaining a top-1 accuracy of 70.95% on the ImageNet dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11852v1-abstract-full').style.display = 'none'; document.getElementById('2411.11852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ASPDAC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10960">arXiv:2411.10960</a> <span> [<a href="https://arxiv.org/pdf/2411.10960">pdf</a>, <a href="https://arxiv.org/format/2411.10960">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Beamforming Design and Multi-User Scheduling in Transmissive RIS Enabled Distributed Cooperative ISAC Networks with RSMA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qingqing Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhendong Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiong Wu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+N">Nan Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10960v1-abstract-short" style="display: inline;"> In this paper, we propose a novel transmissive reconfigurable intelligent surface (TRIS) transceiver-empowered distributed cooperative integrated sensing and communication (ISAC) network to enhance coverage as well as to enhance wireless environment understanding. 
arXiv:2411.10960 (https://arxiv.org/abs/2411.10960)
Subjects: Information Theory (cs.IT)
Title: Beamforming Design and Multi-User Scheduling in Transmissive RIS Enabled Distributed Cooperative ISAC Networks with RSMA
Authors: Ziwei Liu, Wen Chen, Qingqing Wu, Zhendong Li, Qiong Wu, Nan Cheng, Jun Li
Abstract: In this paper, we propose a novel transmissive reconfigurable intelligent surface (TRIS) transceiver-empowered distributed cooperative integrated sensing and communication (ISAC) network to enhance coverage as well as wireless environment understanding. Based on the network requirements, the users are categorized into cooperative users (CUEs) and destination users (DUEs), and the CUEs utilize their own resources to serve the DUEs. To realize cooperation, we implement rate-splitting multiple access (RSMA) at the base station (BS), where the common stream is decoded and re-encoded at the CUEs and forwarded to the DUEs, while the private stream satisfies the CUEs' own communication requirements. We construct an optimization problem with maximum minimum radar mutual information (RMI) as the objective function to optimize the BS beamforming matrix, the CUE beamforming matrices, the common stream rate vectors, and the user scheduling vectors. Due to the coupling of the optimization variables and the non-convex operations involved, the problem is non-convex and cannot be solved directly. To address these challenges, we adopt a consensus alternating direction method of multipliers (ADMM) framework to decouple the optimization variables and solve it. Specifically, the problem is decoupled into multiple subproblems that are solved independently by iterative optimization until overall convergence is achieved. Finally, numerical results validate the superiority of the proposed scheme in terms of improving communication sum-rate and RMI while greatly reducing algorithm complexity.
Submitted: 16 November 2024; originally announced November 2024.
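The consensus-ADMM pattern used here to decouple coupled variables can be shown on a toy problem. The sketch below minimizes a sum of scalar quadratics by giving each agent a local copy, a shared consensus variable, and dual updates; the data, the penalty parameter, and the closed-form local step are illustrative assumptions and do not represent the actual RMI beamforming objective.

    import numpy as np

    # Toy consensus ADMM: minimize sum_i 0.5*(x - a_i)^2 by keeping a local copy
    # x_i per agent, forcing x_i = z, and alternating local/consensus/dual updates.
    # Illustrates only the decomposition pattern, not the paper's beamforming problem.
    a = np.array([1.0, 4.0, 7.0])   # per-agent data (assumed)
    rho = 1.0                       # ADMM penalty parameter
    x = np.zeros_like(a)            # local variables
    z = 0.0                         # consensus variable
    u = np.zeros_like(a)            # scaled dual variables

    for _ in range(50):
        # Local step: argmin_x 0.5*(x - a_i)^2 + (rho/2)*(x - z + u_i)^2 (closed form)
        x = (a + rho * (z - u)) / (1.0 + rho)
        # Consensus step: agree on the average of locals plus duals
        z = np.mean(x + u)
        # Dual ascent on the constraint x_i = z
        u = u + x - z

    print(round(z, 3), a.mean())    # both are 4.0, the global minimizer

Each subproblem here has a closed-form solution; in the paper's setting the same skeleton would instead call per-block solvers for the beamforming and scheduling variables.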
arXiv:2411.10830 (https://arxiv.org/abs/2411.10830)
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Optimization and Control (math.OC)
Title: One-Layer Transformer Provably Learns One-Nearest Neighbor In Context
Authors: Zihao Li, Yuan Cao, Cheng Gao, Yihan He, Han Liu, Jason M. Klusowski, Jianqing Fan, Mengdi Wang
Abstract: Transformers have achieved great success in recent years. Interestingly, transformers have shown particularly strong in-context learning capability -- even without fine-tuning, they are still able to solve unseen tasks well purely based on task-specific prompts. In this paper, we study the capability of one-layer transformers in learning one of the most classical nonparametric estimators, the one-nearest neighbor prediction rule. Under a theoretical framework where the prompt contains a sequence of labeled training data and unlabeled test data, we show that, although the loss function is nonconvex when trained with gradient descent, a single softmax attention layer can successfully learn to behave like a one-nearest neighbor classifier. Our result gives a concrete example of how transformers can be trained to implement nonparametric machine learning algorithms, and sheds light on the role of softmax attention in transformer models.
Submitted: 16 November 2024; originally announced November 2024.
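The headline claim is easy to check numerically: if a softmax attention head scores prompt examples by negative squared distance to the query and the inverse temperature is large, the softmax mass collapses onto the closest labeled example, so the readout is a one-nearest-neighbor prediction. The sketch below is a hand-constructed illustration under those assumptions, not the trained one-layer transformer analyzed in the paper.

    import numpy as np

    # Softmax attention over in-context examples that behaves like 1-nearest-neighbor
    # when the inverse temperature beta is large. Hand-set construction for intuition;
    # the paper studies what gradient descent actually learns, not this shortcut.
    def attention_predict(x_train, y_train, x_query, beta=50.0):
        scores = -beta * np.sum((x_train - x_query) ** 2, axis=1)   # similarity = -distance
        weights = np.exp(scores - scores.max())
        weights /= weights.sum()
        return float(weights @ y_train)            # attention readout = weighted label average

    x_train = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 0.0]])
    y_train = np.array([0.0, 1.0, 0.0])
    print(attention_predict(x_train, y_train, np.array([0.9, 1.1])))   # ~1.0, the nearest label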
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10830v1-abstract-full').style.display = 'none'; document.getElementById('2411.10830v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10720">arXiv:2411.10720</a> <span> [<a href="https://arxiv.org/pdf/2411.10720">pdf</a>, <a href="https://arxiv.org/format/2411.10720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> Multi Scale Graph Neural Network for Alzheimer's Disease </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chauhan%2C+A">Anya Chauhan</a>, <a href="/search/cs?searchtype=author&query=Noori%2C+A">Ayush Noori</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaozhi Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yingnan He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M+M">Michelle M Li</a>, <a href="/search/cs?searchtype=author&query=Zitnik%2C+M">Marinka Zitnik</a>, <a href="/search/cs?searchtype=author&query=Das%2C+S">Sudeshna Das</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10720v1-abstract-short" style="display: inline;"> Alzheimer's disease (AD) is a complex, progressive neurodegenerative disorder characterized by extracellular A\b{eta} plaques, neurofibrillary tau tangles, glial activation, and neuronal degeneration, involving multiple cell types and pathways. Current models often overlook the cellular context of these pathways. To address this, we developed a multiscale graph neural network (GNN) model, ALZ PINN… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10720v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10720v1-abstract-full" style="display: none;"> Alzheimer's disease (AD) is a complex, progressive neurodegenerative disorder characterized by extracellular A\b{eta} plaques, neurofibrillary tau tangles, glial activation, and neuronal degeneration, involving multiple cell types and pathways. Current models often overlook the cellular context of these pathways. To address this, we developed a multiscale graph neural network (GNN) model, ALZ PINNACLE, using brain omics data from donors spanning the entire aging to AD spectrum. ALZ PINNACLE is based on the PINNACLE GNN framework, which learns context-aware protein, cell type, and tissue representations within a unified latent space. ALZ PINNACLE was trained on 14,951 proteins, 206,850 protein interactions, 7 cell types, and 48 cell subtypes or states. 
After pretraining, we investigated the learned embedding of APOE, the largest genetic risk factor for AD, across different cell types. Notably, APOE embeddings showed high similarity in microglial, neuronal, and CD8 cells, suggesting a similar role of APOE in these cell types. Fine-tuning the model on AD risk genes revealed cell type contexts predictive of the role of APOE in AD. Our results suggest that ALZ PINNACLE may provide a valuable framework for uncovering novel insights into AD neurobiology.
Submitted: 16 November 2024; originally announced November 2024.
Comments: Findings paper presented at Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024, Vancouver, Canada, 9 pages

arXiv:2411.10461 (https://arxiv.org/abs/2411.10461)
Subjects: Human-Computer Interaction (cs.HC); Artificial Intelligence (cs.AI)
Title: Utilizing Human Behavior Modeling to Manipulate Explanations in AI-Assisted Decision Making: The Good, the Bad, and the Scary
Authors: Zhuoyan Li, Ming Yin
Abstract: Recent advances in AI models have increased the integration of AI-based decision aids into the human decision making process. To fully unlock the potential of AI-assisted decision making, researchers have computationally modeled how humans incorporate AI recommendations into their final decisions, and utilized these models to improve human-AI team performance.
Meanwhile, due to the "black-box" nature of AI models, providing AI explanations to human decision makers to help them rely on AI recommendations more appropriately has become a common practice. In this paper, we explore whether we can quantitatively model how humans integrate both AI recommendations and explanations into their decision process, and whether this quantitative understanding of human behavior from the learned model can be utilized to manipulate AI explanations, thereby nudging individuals towards making targeted decisions. Our extensive human experiments across various tasks demonstrate that human behavior can be easily influenced by these manipulated explanations towards targeted outcomes, regardless of whether the intent is adversarial or benign. Furthermore, individuals often fail to detect any anomalies in these explanations, despite their decisions being affected by them.
Submitted: 2 November 2024; originally announced November 2024.
Comments: NeurIPS 2024

arXiv:2411.10232 (https://arxiv.org/abs/2411.10232)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
Title: ColorEdit: Training-free Image-Guided Color editing with diffusion model
Authors: Xingxi Yin, Zhi Li, Jingfeng Zhang, Chenglin Li, Yin Zhang
Abstract: Text-to-image (T2I) diffusion models, with their impressive generative capabilities, have been adopted for image editing tasks, demonstrating remarkable efficacy.
However, due to attention leakage and collision between the cross-attention map of the object and the new color attribute from the text prompt, text-guided image editing methods may fail to change the color of an object, resulting in a misalignment between the resulting image and the text prompt. In this paper, we conduct an in-depth analysis of the text-guided image synthesis process and of what semantic information different cross-attention blocks have learned. We observe that the visual representation of an object is determined in the up-block of the diffusion model in the early stage of the denoising process, and that color adjustment can be achieved through value matrices alignment in the cross-attention layer. Based on our findings, we propose a straightforward yet stable and effective image-guided method to modify the color of an object without requiring any additional fine-tuning or training. Lastly, we present a benchmark dataset called COLORBENCH, the first benchmark to evaluate the performance of color change methods. Extensive experiments validate the effectiveness of our method in object-level color editing and show that it surpasses popular text-guided image editing approaches on both synthesized and real images.
Submitted: 15 November 2024; originally announced November 2024.
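The observation that color can be steered by aligning value matrices in cross-attention can be pictured with a toy attention call: keep the attention map computed from the source prompt, but take the value vector of the edited (color) token from the new prompt. The shapes, the token index, and the bare NumPy softmax below are illustrative assumptions; this is not the ColorEdit implementation.

    import numpy as np

    # Toy cross-attention step: reuse the source prompt's attention map but swap in
    # the edited prompt's value vector for one target token (e.g., the color word).
    # Shapes, token index, and this bare softmax are assumptions, not ColorEdit itself.
    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    def cross_attn_value_swap(q, k_src, v_src, v_edit, swap_token):
        attn = softmax(q @ k_src.T / np.sqrt(q.shape[-1]))   # attention map from source prompt
        v = v_src.copy()
        v[swap_token] = v_edit[swap_token]                    # align only the edited token's values
        return attn @ v

    rng = np.random.default_rng(0)
    q, k = rng.normal(size=(16, 8)), rng.normal(size=(4, 8))        # 16 image queries, 4 text tokens
    v_src, v_edit = rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
    print(cross_attn_value_swap(q, k, v_src, v_edit, swap_token=2).shape)   # (16, 8)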
arXiv:2411.09981 (https://arxiv.org/abs/2411.09981)
Subjects: Distributed, Parallel, and Cluster Computing (cs.DC)
Title: SoK: Consensus for Fair Message Ordering
Authors: Zhuolun Li, Evangelos Pournaras
Abstract: Distributed ledger systems, such as blockchains, rely on consensus protocols that constantly commit messages in an agreed order for processing. In practice, message ordering within these systems is often reward-driven. This raises concerns about fairness, particularly in decentralized finance applications, where nodes can exploit transaction orders to maximize rewards (Maximal Extractable Value, MEV). This paper provides a structured review of consensus protocols that order messages with different approaches, especially focusing on the ones that promote order fairness, using methods including First-In-First-Out (FIFO), random, and blind ordering. We review the challenges and trade-offs of deriving fair message ordering in a Byzantine fault-tolerant setting, and summarize the key steps for making a fair message ordering consensus protocol. We introduce a design guideline, with which we propose a performance optimization to the state-of-the-art FIFO ordering protocol Themis. This work establishes a unified framework for assessing and enhancing fairness in distributed ledger systems.
Submitted: 15 November 2024; originally announced November 2024.
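A building block behind FIFO-style order fairness can be shown in a few lines: each node reports the order in which it received transactions, and the final order ranks transactions by the median of their local positions, so that a small coalition cannot unilaterally push one transaction ahead. The sketch below ignores Byzantine reporters, batching, and the dependency-graph machinery of protocols like Themis; it only illustrates the aggregation idea.

    from statistics import median

    # Simplified FIFO-style fair ordering: rank each transaction by the median of the
    # positions at which the nodes received it. Real protocols (e.g., Themis) must
    # also tolerate Byzantine nodes and resolve ordering cycles; that is omitted here.
    def fair_order(local_orders):
        txs = set().union(*map(set, local_orders))
        rank = {tx: median(order.index(tx) for order in local_orders) for tx in txs}
        return sorted(txs, key=lambda tx: (rank[tx], tx))

    node_views = [
        ["a", "b", "c", "d"],
        ["b", "a", "c", "d"],
        ["a", "c", "b", "d"],
    ]
    print(fair_order(node_views))   # ['a', 'b', 'c', 'd']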
arXiv:2411.09863 (https://arxiv.org/abs/2411.09863)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Cryptography and Security (cs.CR)
Title: Face De-identification: State-of-the-art Methods and Comparative Studies
Authors: Jingyi Cao, Xiangyi Chen, Bo Liu, Ming Ding, Rong Xie, Li Song, Zhu Li, Wenjun Zhang
Abstract: The widespread use of image acquisition technologies, along with advances in facial recognition, has raised serious privacy concerns. Face de-identification usually refers to the process of concealing or replacing personal identifiers, which is regarded as an effective means to protect the privacy of facial images. A significant number of methods for face de-identification have been proposed in recent years. In this survey, we provide a comprehensive review of state-of-the-art face de-identification methods, categorized into three levels: pixel-level, representation-level, and semantic-level techniques. We systematically evaluate these methods based on two key criteria, the effectiveness of privacy protection and preservation of image utility, highlighting their advantages and limitations. Our analysis includes qualitative and quantitative comparisons of the main algorithms, demonstrating that deep learning-based approaches, particularly those using Generative Adversarial Networks (GANs) and diffusion models, have achieved significant advancements in balancing privacy and utility. Experimental results reveal that while recent methods demonstrate strong privacy protection, trade-offs remain in visual fidelity and computational complexity.
This survey not only summarizes the current landscape but also identifies key challenges and future research directions in face de-identification.
Submitted: 14 November 2024; originally announced November 2024.

arXiv:2411.09760 (https://arxiv.org/abs/2411.09760)
Subjects: Hardware Architecture (cs.AR); Emerging Technologies (cs.ET); Signal Processing (eess.SP)
Title: SpecPCM: A Low-power PCM-based In-Memory Computing Accelerator for Full-stack Mass Spectrometry Analysis
Authors: Keming Fan, Ashkan Moradifirouzabadi, Xiangjin Wu, Zheyu Li, Flavio Ponzina, Anton Persson, Eric Pop, Tajana Rosing, Mingu Kang
Abstract: Mass spectrometry (MS) is essential for proteomics and metabolomics but faces impending challenges in efficiently processing the vast volumes of data. This paper introduces SpecPCM, an in-memory computing (IMC) accelerator designed to achieve substantial improvements in energy and delay efficiency for both MS spectral clustering and database (DB) search.
SpecPCM employs analog processing with low-voltage swing and utilizes recently introduced phase change memory (PCM) devices based on superlattice materials, optimized for low-voltage and low-power programming. Our approach integrates contributions across multiple levels: application, algorithm, circuit, device, and instruction sets. We leverage a robust hyperdimensional computing (HD) algorithm with a novel dimension-packing method and develop specialized hardware for the end-to-end MS pipeline to overcome the non-ideal behavior of PCM devices. We further optimize multi-level PCM devices for different tasks by using different materials. We also perform a comprehensive design exploration to improve energy and delay efficiency while maintaining accuracy, exploring various combinations of hardware and software parameters controlled by the instruction set architecture (ISA). SpecPCM, with up to three bits per cell, achieves speedups of up to 82x and 143x for MS clustering and DB search tasks, respectively, along with a four-orders-of-magnitude improvement in energy efficiency compared with state-of-the-art CPU/GPU tools.
Submitted: 14 November 2024; originally announced November 2024.
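The hyperdimensional computing step that such accelerators build on can be sketched in software: each spectrum becomes one high-dimensional bipolar vector by binding a random per-bin identity vector with a quantized-intensity level vector and bundling over bins, after which clustering or database search reduces to cheap similarity comparisons. The dimension, bin count, and quantization below are assumptions, and the paper's dimension-packing method and PCM device mapping are not modeled.

    import numpy as np

    # Generic hyperdimensional (HD) encoding of a binned spectrum: bind a random
    # "bin identity" hypervector with a quantized "intensity level" hypervector and
    # bundle over bins. Parameters are illustrative; SpecPCM's dimension packing and
    # PCM mapping are not represented here.
    D, BINS, LEVELS = 4096, 64, 8
    rng = np.random.default_rng(1)
    bin_hv = rng.choice([-1, 1], size=(BINS, D))      # one random hypervector per m/z bin
    level_hv = rng.choice([-1, 1], size=(LEVELS, D))  # one per quantized intensity level

    def encode(spectrum):
        levels = np.minimum((spectrum * LEVELS).astype(int), LEVELS - 1)
        return np.sign((bin_hv * level_hv[levels]).sum(axis=0))   # bind, then bundle

    def similarity(a, b):
        return float(a @ b) / D

    s1 = rng.random(BINS)
    s2 = np.clip(s1 + 0.05 * rng.random(BINS), 0, 1)   # slightly perturbed copy
    s3 = rng.random(BINS)                               # unrelated spectrum
    print(similarity(encode(s1), encode(s2)), similarity(encode(s1), encode(s3)))
    # the perturbed copy scores far higher than the unrelated spectrum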
arXiv:2411.09691 (https://arxiv.org/abs/2411.09691)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Title: Advancing Fine-Grained Visual Understanding with Multi-Scale Alignment in Multi-Modal Models
Authors: Wei Wang, Zhaowei Li, Qi Xu, Linfeng Li, YiQing Cai, Botian Jiang, Hang Song, Xingcan Hu, Pengyu Wang, Li Xiao
Abstract: Multi-modal large language models (MLLMs) have achieved remarkable success in fine-grained visual understanding across a range of tasks. However, they often encounter significant challenges due to inadequate alignment for fine-grained knowledge, which restricts their ability to accurately capture local details and attain a comprehensive global perception. While recent advancements have focused on aligning object expressions with grounding information, they typically lack explicit integration of object images, which contain rich information beyond mere texts or coordinates. To bridge this gap, we introduce a novel fine-grained visual knowledge alignment method that effectively aligns and integrates multi-scale knowledge of objects, including texts, coordinates, and images. This innovative method is underpinned by our multi-scale fine-grained enhancement data synthesis pipeline, which provides over 300K essential training data to enhance alignment and improve overall performance. Furthermore, we present TinyGroundingGPT, a series of compact models optimized for high-level alignments. With a scale of approximately 3B parameters, TinyGroundingGPT achieves outstanding results in grounding tasks while delivering performance comparable to larger MLLMs in complex visual scenarios.
Submitted: 14 November 2024; originally announced November 2024.
arXiv:2411.09301 (https://arxiv.org/abs/2411.09301)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Title: LHRS-Bot-Nova: Improved Multimodal Large Language Model for Remote Sensing Vision-Language Interpretation
Authors: Zhenshi Li, Dilxat Muhtar, Feng Gu, Xueliang Zhang, Pengfeng Xiao, Guangjun He, Xiaoxiang Zhu
Abstract: Automatically and rapidly understanding Earth's surface is fundamental to our grasp of the living environment and informed decision-making. This underscores the need for a unified system with comprehensive capabilities in analyzing Earth's surface to address a wide range of human needs. The emergence of multimodal large language models (MLLMs) has great potential in boosting the efficiency and convenience of intelligent Earth observation. These models can engage in human-like conversations, serve as unified platforms for understanding images, follow diverse instructions, and provide insightful feedback. In this study, we introduce LHRS-Bot-Nova, an MLLM specialized in understanding remote sensing (RS) images, designed to expertly perform a wide range of RS understanding tasks aligned with human instructions. LHRS-Bot-Nova features an enhanced vision encoder and a novel bridge layer, enabling efficient visual compression and better language-vision alignment. To further enhance RS-oriented vision-language alignment, we propose a large-scale RS image-caption dataset, generated through feature-guided image recaptioning. Additionally, we introduce an instruction dataset specifically designed to improve spatial recognition abilities. Extensive experiments demonstrate superior performance of LHRS-Bot-Nova across various RS image understanding tasks.
We also evaluate the performance of different MLLMs on complex RS perception and instruction following using a complicated multiple-choice question evaluation benchmark, providing a reliable guide for future model selection and improvement. Data, code, and models will be available at https://github.com/NJU-LHRS/LHRS-Bot.
Submitted: 14 November 2024; originally announced November 2024.

arXiv:2411.09259 (https://arxiv.org/abs/2411.09259)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Computation and Language (cs.CL)
Title: Jailbreak Attacks and Defenses against Multimodal Generative Models: A Survey
Authors: Xuannan Liu, Xing Cui, Peipei Li, Zekun Li, Huaibo Huang, Shuhan Xia, Miaoxuan Zhang, Yueying Zou, Ran He
Abstract: The rapid evolution of multimodal foundation models has led to significant advancements in cross-modal understanding and generation across diverse modalities, including text, images, audio, and video. However, these models remain susceptible to jailbreak attacks, which can bypass built-in safety mechanisms and induce the production of potentially harmful content.
Consequently, understanding the methods of jailbreak attacks and existing defense mechanisms is essential to ensure the safe deployment of multimodal generative models in real-world scenarios, particularly in security-sensitive applications. To provide comprehensive insight into this topic, this survey reviews jailbreak attacks and defenses in multimodal generative models. First, given the generalized lifecycle of multimodal jailbreak, we systematically explore attacks and corresponding defense strategies across four levels: input, encoder, generator, and output. Based on this analysis, we present a detailed taxonomy of attack methods, defense mechanisms, and evaluation frameworks specific to multimodal generative models. Additionally, we cover a wide range of input-output configurations, including modalities such as Any-to-Text, Any-to-Vision, and Any-to-Any within generative systems. Finally, we highlight current research challenges and propose potential directions for future research. The open-source repository corresponding to this work can be found at https://github.com/liuxuannan/Awesome-Multimodal-Jailbreak.
Submitted: 14 November 2024; originally announced November 2024.
Comments: ongoing work

arXiv:2411.09220 (https://arxiv.org/abs/2411.09220)
Subjects: Audio and Speech Processing (eess.AS); Artificial Intelligence (cs.AI); Signal Processing (eess.SP)
Title: Transferable Adversarial Attacks against ASR
Authors: Xiaoxue Gao, Zexin Li, Yiming Chen, Cong Liu, Haizhou Li
Abstract: Given the extensive research and real-world applications of automatic speech recognition (ASR), ensuring the robustness of ASR models against minor input perturbations becomes a crucial consideration for maintaining their effectiveness in real-time scenarios.
Previous explorations into ASR model robustness have predominantly revolved around evaluating accuracy in white-box settings with full access to ASR models. Nevertheless, full ASR model details are often not available in real-world applications. Therefore, evaluating the robustness of black-box ASR models is essential for a comprehensive understanding of ASR model resilience. In this regard, we thoroughly study the vulnerability of cutting-edge ASR models to practical black-box attacks and propose to employ two advanced time-domain-based transferable attacks alongside our differentiable feature extractor. We also propose a speech-aware gradient optimization approach (SAGO) for ASR, which forces mistranscription with minimal perceptual impact on human listeners, through a voice activity detection rule and a speech-aware gradient-oriented optimizer. Our comprehensive experimental results reveal performance enhancements compared to baseline approaches across five models on two databases.
Submitted: 14 November 2024; originally announced November 2024.
Comments: IEEE SPL
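The speech-aware idea of restricting an adversarial perturbation to voiced regions can be illustrated with a single masked gradient-sign step. The sketch below uses a crude energy-based voice activity detector and a synthetic stand-in gradient; it is an assumption-heavy toy rather than the SAGO optimizer, which would backpropagate through an actual ASR model.

    import numpy as np

    # One masked FGSM-style step: perturb only frames that a crude energy-based VAD
    # marks as speech, so silent regions stay untouched. The "gradient" here is a
    # synthetic stand-in; a real attack would differentiate an ASR loss w.r.t. audio.
    def energy_vad_mask(audio, frame=400, thresh=0.01):
        mask = np.zeros_like(audio)
        for start in range(0, len(audio), frame):
            seg = audio[start:start + frame]
            if np.mean(seg ** 2) > thresh:           # frame energy above threshold => speech
                mask[start:start + len(seg)] = 1.0
        return mask

    def masked_fgsm_step(audio, grad, eps=0.002):
        return audio + eps * energy_vad_mask(audio) * np.sign(grad)

    rng = np.random.default_rng(0)
    audio = np.concatenate([0.001 * rng.normal(size=4000),    # near-silence
                            0.3 * rng.normal(size=4000)])     # "speech"
    grad = rng.normal(size=audio.shape)                        # stand-in for dLoss/daudio
    adv = masked_fgsm_step(audio, grad)
    print(np.abs(adv - audio)[:4000].max(), np.abs(adv - audio)[4000:].max())
    # 0.0 for the silent half, ~0.002 for the voiced half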
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE SPL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09105">arXiv:2411.09105</a> <span> [<a href="https://arxiv.org/pdf/2411.09105">pdf</a>, <a href="https://arxiv.org/format/2411.09105">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges in Video Cognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenglin Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qianglong Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+F">Feng Tao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09105v1-abstract-short" style="display: inline;"> Recent advancements in Large Video-Language Models (LVLMs) have driven the development of benchmarks designed to assess cognitive abilities in video-based tasks. However, most existing benchmarks heavily rely on web-collected videos paired with human annotations or model-generated questions, which limit control over the video content and fall short in evaluating advanced cognitive abilities involv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09105v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09105v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09105v1-abstract-full" style="display: none;"> Recent advancements in Large Video-Language Models (LVLMs) have driven the development of benchmarks designed to assess cognitive abilities in video-based tasks. However, most existing benchmarks heavily rely on web-collected videos paired with human annotations or model-generated questions, which limit control over the video content and fall short in evaluating advanced cognitive abilities involving symbolic elements and abstract concepts. To address these limitations, we introduce VCBench, a controllable benchmark to assess LVLMs' cognitive abilities, involving symbolic and abstract concepts at varying difficulty levels. By generating video data with the Python-based engine, VCBench allows for precise control over the video content, creating dynamic, task-oriented videos that feature complex scenes and abstract concepts. Each task pairs with tailored question templates that target specific cognitive challenges, providing a rigorous evaluation test. Our evaluation reveals that even state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple video cognition tasks involving abstract concepts, with performance sharply dropping by 19% as video complexity rises. 
These findings reveal the current limitations of LVLMs in advanced cognitive tasks and highlight the critical role of VCBench in driving research toward more robust LVLMs for complex video cognition challenges.
Submitted: 13 November 2024; originally announced November 2024.

arXiv:2411.07685 (https://arxiv.org/abs/2411.07685)
Subjects: Computer Vision and Pattern Recognition (cs.CV); Artificial Intelligence (cs.AI)
Title: Fast Disentangled Slim Tensor Learning for Multi-view Clustering
Authors: Deng Xu, Chao Zhang, Zechao Li, Chunlin Chen, Huaxiong Li
Abstract: Tensor-based multi-view clustering has recently received significant attention due to its exceptional ability to explore cross-view high-order correlations. However, most existing methods still encounter some limitations. (1) Most of them explore the correlations among different affinity matrices, making them unscalable to large-scale data. (2) Although some methods address it by introducing bipartite graphs, they may result in sub-optimal solutions caused by an unstable anchor selection process. (3) They generally ignore the negative impact of latent semantic-unrelated information in each view. To tackle these issues, we propose a new approach termed fast Disentangled Slim Tensor Learning (DSTL) for multi-view clustering.
Instead of focusing on the multi-view graph structures, DSTL directly explores the high-order correlations among multi-view latent semantic representations based on matrix factorization. To alleviate the negative influence of feature redundancy, inspired by robust PCA, DSTL disentangles the latent low-dimensional representation into a semantic-unrelated part and a semantic-related part for each view. Subsequently, two slim tensors are constructed with tensor-based regularization. To further enhance the quality of feature disentanglement, the semantic-related representations are aligned across views through a consensus alignment indicator. Our proposed model is computationally efficient and can be solved effectively. Extensive experiments demonstrate the superiority and efficiency of DSTL over state-of-the-art approaches. The code of DSTL is available at https://github.com/dengxu-nju/DSTL.
Submitted: 12 November 2024; originally announced November 2024.
Comments: 13 pages, 6 figures; to be published in IEEE TMM

arXiv:2411.07506 (https://arxiv.org/abs/2411.07506)
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI)
Title: FM-TS: Flow Matching for Time Series Generation
Authors: Yang Hu, Xiao Wang, Lirong Wu, Huatian Zhang, Stan Z. Li, Sheng Wang, Tianlong Chen
Abstract: Time series generation has emerged as an essential tool for analyzing temporal data across numerous fields. While diffusion models have recently gained significant attention in generating high-quality time series, they tend to be computationally demanding and reliant on complex stochastic processes.
arXiv:2411.07506 (https://arxiv.org/abs/2411.07506) [pdf, other]
Subjects: cs.LG; cs.AI
Title: FM-TS: Flow Matching for Time Series Generation
Authors: Yang Hu, Xiao Wang, Lirong Wu, Huatian Zhang, Stan Z. Li, Sheng Wang, Tianlong Chen
Abstract: Time series generation has emerged as an essential tool for analyzing temporal data across numerous fields. While diffusion models have recently gained significant attention in generating high-quality time series, they tend to be computationally demanding and reliant on complex stochastic processes. To address these limitations, we introduce FM-TS, a rectified Flow Matching-based framework for Time Series generation, which simplifies the time series generation process by directly optimizing continuous trajectories. This approach avoids the need for iterative sampling or complex noise schedules typically required in diffusion-based models. FM-TS is more efficient in terms of training and inference. Moreover, FM-TS is highly adaptive, supporting both conditional and unconditional time series generation. Notably, through our novel inference design, the model trained in an unconditional setting can seamlessly generalize to conditional tasks without the need for retraining. Extensive benchmarking across both settings demonstrates that FM-TS consistently delivers superior performance compared to existing approaches while being more efficient in terms of training and inference. For instance, in terms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005, 0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI unconditional time series datasets, respectively, significantly outperforming the second-best method which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and 0.167 on the same datasets. We have achieved superior performance in solar forecasting and MuJoCo imputation tasks, significantly enhanced by our innovative $t$ power sampling method. The code is available at https://github.com/UNITES-Lab/FMTS.
Submitted: 11 November, 2024; originally announced November 2024.
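For readers unfamiliar with rectified flow matching, the training objective regresses a velocity field toward the straight-line displacement between noise and data. The sketch below shows that generic loss for a batch of time series; the tiny network, shapes, and names are placeholders and not the FM-TS architecture (see the linked repository for the authors' code).

```python
import torch
import torch.nn as nn

class TinyVelocityNet(nn.Module):
    """Placeholder velocity field v_theta(x_t, t); FM-TS uses its own model."""
    def __init__(self, seq_len, dim, hidden=128):
        super().__init__()
        self.seq_len, self.dim = seq_len, dim
        self.net = nn.Sequential(
            nn.Linear(seq_len * dim + 1, hidden), nn.SiLU(),
            nn.Linear(hidden, seq_len * dim),
        )

    def forward(self, x_t, t):
        flat = torch.cat([x_t.flatten(1), t[:, None]], dim=1)
        return self.net(flat).view(-1, self.seq_len, self.dim)

def rectified_flow_loss(model, x1):
    """Regress the velocity toward x1 - x0 along x_t = (1 - t) x0 + t x1."""
    x0 = torch.randn_like(x1)                     # noise endpoint
    t = torch.rand(x1.size(0), device=x1.device)  # uniform time in [0, 1)
    x_t = (1 - t)[:, None, None] * x0 + t[:, None, None] * x1
    target_v = x1 - x0
    return ((model(x_t, t) - target_v) ** 2).mean()

model = TinyVelocityNet(seq_len=24, dim=6)
batch = torch.randn(32, 24, 6)                    # stand-in time-series batch
loss = rectified_flow_loss(model, batch)
loss.backward()
```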
arXiv:2411.07480 (https://arxiv.org/abs/2411.07480) [pdf, other]
Subjects: cs.SE
Title: Discovery of Timeline and Crowd Reaction of Software Vulnerability Disclosures
Authors: Yi Wen Heng, Zeyang Ma, Haoxiang Zhang, Zhenhao Li, Tse-Hsun Chen
Abstract: Reusing third-party libraries increases productivity and saves time and costs for developers. However, the downside is the presence of vulnerabilities in those libraries, which can lead to catastrophic outcomes. For instance, Apache Log4J was found to be vulnerable to remote code execution attacks. A total of more than 35,000 packages were forced to update their Log4J libraries with the latest version. Although several studies have been conducted to predict software vulnerabilities, the prediction does not cover the vulnerabilities found in third-party libraries. Even if the developers are aware of the forthcoming issue, replicating a function similar to the libraries would be time-consuming and labour-intensive. Nevertheless, it is practically reasonable for software developers to update their third-party libraries (and dependencies) whenever the software vendors have released a vulnerability-free version. In this work, our manual study focuses on the real-world practices (crowd reaction) adopted by software vendors and developer communities when a vulnerability is disclosed. We manually investigated 312 CVEs and identified that the primary trend of vulnerability handling is to provide a fix before publishing an announcement. Otherwise, developers wait an average of 10 days for a fix if it is unavailable upon the announcement. Additionally, the crowd reaction is oblivious to the vulnerability severity. In particular, we identified Oracle as the most vibrant community diligent in releasing fixes.
Their software developers also actively participate in the associated vulnerability announcements.
Submitted: 19 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.

arXiv:2411.07445 (https://arxiv.org/abs/2411.07445) [pdf, other]
Subjects: cs.CV
Title: All-in-one Weather-degraded Image Restoration via Adaptive Degradation-aware Self-prompting Model
Authors: Yuanbo Wen, Tao Gao, Ziqi Li, Jing Zhang, Kaihao Zhang, Ting Chen
Abstract: Existing approaches for all-in-one weather-degraded image restoration suffer from inefficiencies in leveraging degradation-aware priors, resulting in sub-optimal performance in adapting to different weather conditions. To this end, we develop an adaptive degradation-aware self-prompting model (ADSM) for all-in-one weather-degraded image restoration. Specifically, our model employs the contrastive language-image pre-training model (CLIP) to facilitate the training of our proposed latent prompt generators (LPGs), which represent three types of latent prompts to characterize the degradation type, degradation property and image caption. Moreover, we integrate the acquired degradation-aware prompts into the time embedding of the diffusion model to improve degradation perception.
Meanwhile, we employ the latent caption prompt to guide the reverse sampling process using the cross-attention mechanism, thereby guiding accurate image reconstruction. Furthermore, to accelerate the reverse sampling procedure of the diffusion model and address the limitations of frequency perception, we introduce a wavelet-oriented noise estimating network (WNE-Net). Extensive experiments conducted on eight publicly available datasets demonstrate the effectiveness of our proposed approach in both task-specific and all-in-one applications.
Submitted: 11 November, 2024; originally announced November 2024.

arXiv:2411.07381 (https://arxiv.org/abs/2411.07381) [pdf, other]
Subjects: cs.CL
Title: BeeManc at the PLABA Track of TAC-2024: RoBERTa for task 1 -- LLaMA3.1 and GPT-4o for task 2
Authors: Zhidong Ling, Zihao Li, Pablo Romero, Lifeng Han, Goran Nenadic
Abstract: This report is the system description of the BeeManc team for the shared task Plain Language Adaptation of Biomedical Abstracts (PLABA) 2024. It contains two sections corresponding to the two sub-tasks in PLABA 2024. In task one, we applied fine-tuned RoBERTa-Base models to identify and classify the difficult terms, jargon and acronyms in the biomedical abstracts and reported the F1 score. Due to time constraints, we did not finish the replacement task.
In task two, we leveraged LLaMA-3.1-70B-Instruct and GPT-4o with one-shot prompts to complete the abstract adaptation and reported the scores in BLEU, SARI, BERTScore, LENS, and SALSA. In the official evaluation of PLABA-2024 on Tasks 1A and 1B, our much smaller fine-tuned RoBERTa-Base model ranked 3rd and 2nd, respectively, on the two sub-tasks, and 1st on averaged F1 scores across the two tasks among 9 evaluated systems. Our LLaMA-3.1-70B-Instruct model achieved the highest Completeness score for Task 2. We share our fine-tuned models and related resources at https://github.com/HECTA-UoM/PLABA2024.
Submitted: 18 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.
Comments: ongoing work - system report

arXiv:2411.07135 (https://arxiv.org/abs/2411.07135) [pdf, other]
Subjects: cs.CV; cs.AI; cs.GR
Title: Edify 3D: Scalable High-Quality 3D Asset Generation
Authors: NVIDIA: Maciej Bala, Yin Cui, Yifan Ding, Yunhao Ge, Zekun Hao, Jon Hasselgren, Jacob Huffman, Jingyi Jin,
J. P. Lewis, Zhaoshuo Li, Chen-Hsuan Lin, Yen-Chen Lin, Tsung-Yi Lin, Ming-Yu Liu, Alice Luo, Qianli Ma, Jacob Munkberg, Stella Shi, Fangyin Wei, Donglai Xiang, Jiashu Xu, Xiaohui Zeng, Qinsheng Zhang
Abstract: We introduce Edify 3D, an advanced solution designed for high-quality 3D asset generation. Our method first synthesizes RGB and surface normal images of the described object at multiple viewpoints using a diffusion model. The multi-view observations are then used to reconstruct the shape, texture, and PBR materials of the object. Our method can generate high-quality 3D assets with detailed geometry, clean shape topologies, high-resolution textures, and materials within 2 minutes of runtime.
Submitted: 11 November, 2024; originally announced November 2024.
Comments: Project website: https://research.nvidia.com/labs/dir/edify-3d

arXiv:2411.06976 (https://arxiv.org/abs/2411.06976) [pdf, other]
Subjects: cs.CV; cs.MM
Title: A Hierarchical Compression Technique for 3D Gaussian Splatting Compression
Authors: He Huang, Wenjie Huang, Qi Yang, Yiling Xu, Zhu li
Abstract: 3D Gaussian Splatting (GS) demonstrates excellent rendering quality and generation speed in novel view synthesis. However, substantial data size poses challenges for storage and transmission, making 3D GS compression an essential technology. Current 3D GS compression research primarily focuses on developing more compact scene representations, such as converting explicit 3D GS data into implicit forms. In contrast, compression of the GS data itself has hardly been explored. To address this gap, we propose a Hierarchical GS Compression (HGSC) technique. Initially, we prune unimportant Gaussians based on importance scores derived from both global and local significance, effectively reducing redundancy while maintaining visual quality. An Octree structure is used to compress 3D positions. Based on the 3D GS Octree, we implement a hierarchical attribute compression strategy by employing a KD-tree to partition the 3D GS into multiple blocks. We apply farthest point sampling to select anchor primitives within each block and treat the others as non-anchor primitives with varying Levels of Detail (LoDs). Anchor primitives serve as reference points for predicting non-anchor primitives across different LoDs to reduce spatial redundancy.
For anchor primitives, we use the region adaptive hierarchical transform to achieve near-lossless compression of various attributes. For non-anchor primitives, each is predicted based on the k-nearest anchor primitives. To further minimize prediction errors, the reconstructed LoD and anchor primitives are combined to form new anchor primitives to predict the next LoD. Our method notably achieves superior compression quality and a significant data size reduction of over 4.5 times compared to the state-of-the-art compression method on small-scene datasets.
Submitted: 11 November, 2024; originally announced November 2024.
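The anchor selection step mentioned above relies on farthest point sampling. A minimal, generic sketch of that sampling routine is shown below; it is not the HGSC codebase, and the block size, anchor count, and array names are illustrative assumptions.

```python
import numpy as np

def farthest_point_sampling(points, n_anchors, seed=0):
    """Greedy farthest point sampling: each new anchor is the point whose
    distance to the already chosen anchors is largest."""
    rng = np.random.default_rng(seed)
    chosen = [int(rng.integers(points.shape[0]))]
    dist = np.linalg.norm(points - points[chosen[0]], axis=1)
    for _ in range(n_anchors - 1):
        nxt = int(np.argmax(dist))
        chosen.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(points - points[nxt], axis=1))
    return np.array(chosen)

# Usage: split one KD-tree block of Gaussian centers into anchors and non-anchors.
centers = np.random.rand(5000, 3)                 # stand-in 3D positions
anchor_idx = farthest_point_sampling(centers, n_anchors=256)
non_anchor_mask = np.ones(len(centers), dtype=bool)
non_anchor_mask[anchor_idx] = False
print(anchor_idx.shape, int(non_anchor_mask.sum()))
```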
arXiv:2411.06893 (https://arxiv.org/abs/2411.06893) [pdf, other]
Subjects: cs.CV
Title: Multi-scale Frequency Enhancement Network for Blind Image Deblurring
Authors: Yawen Xiang, Heng Zhou, Chengyang Li, Zhongbo Li, Yongqiang Xie
Abstract: Image deblurring is an essential image preprocessing technique, aiming to recover clear and detailed images from blurry ones. However, existing algorithms often fail to effectively integrate multi-scale feature extraction with frequency enhancement, limiting their ability to reconstruct fine textures. Additionally, non-uniform blur in images also restricts the effectiveness of image restoration. To address these issues, we propose a multi-scale frequency enhancement network (MFENet) for blind image deblurring. To capture the multi-scale spatial and channel information of blurred images, we introduce a multi-scale feature extraction module (MS-FE) based on depthwise separable convolutions, which provides rich target features for deblurring. We propose a frequency enhanced blur perception module (FEBP) that employs wavelet transforms to extract high-frequency details and utilizes multi-strip pooling to perceive non-uniform blur, combining multi-scale information with frequency enhancement to improve the restoration of image texture details. Experimental results on the GoPro and HIDE datasets demonstrate that the proposed method achieves superior deblurring performance in both visual quality and objective evaluation metrics. Furthermore, in downstream object detection tasks, the proposed blind image deblurring algorithm significantly improves detection accuracy, further validating its effectiveness and robustness in the field of image deblurring.
Submitted: 11 November, 2024; originally announced November 2024.
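The FEBP module is described as extracting high-frequency details with wavelet transforms. As a point of reference only, the snippet below shows a single-level 2D discrete wavelet transform that isolates the detail sub-bands; it assumes the PyWavelets package is installed and is not the authors' module.

```python
import numpy as np
import pywt

def high_frequency_bands(image, wavelet="haar"):
    """Single-level 2D DWT; return the three detail sub-bands
    (horizontal, vertical, diagonal) that carry edge and texture information."""
    _, (lh, hl, hh) = pywt.dwt2(image, wavelet)
    return lh, hl, hh

img = np.random.rand(256, 256).astype(np.float32)   # stand-in grayscale image
lh, hl, hh = high_frequency_bands(img)
print(lh.shape, hl.shape, hh.shape)                  # each half-resolution for 'haar'
```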
arXiv:2411.06723 (https://arxiv.org/abs/2411.06723) [pdf, other]
Subjects: cs.HC; cs.AI
Title: Script-Strategy Aligned Generation: Aligning LLMs with Expert-Crafted Dialogue Scripts and Therapeutic Strategies for Psychotherapy
Authors: Xin Sun, Jan de Wit, Zhuying Li, Jiahuan Pei, Abdallah El Ali, Jos A. Bosch
Abstract: Chatbots or conversational agents (CAs) are increasingly used to improve access to digital psychotherapy. Many current systems rely on rigid, rule-based designs, heavily dependent on expert-crafted dialogue scripts for guiding therapeutic conversations. Although recent advances in large language models (LLMs) offer the potential for more flexible interactions, their lack of controllability and transparency poses significant challenges in sensitive areas like psychotherapy. In this work, we explored how aligning LLMs with expert-crafted scripts can enhance psychotherapeutic chatbot performance. Our comparative study showed that LLMs aligned with expert-crafted scripts through prompting and fine-tuning significantly outperformed both pure LLMs and rule-based chatbots, achieving a more effective balance between dialogue flexibility and adherence to therapeutic principles. Building on these findings, we proposed "Script-Strategy Aligned Generation (SSAG)", a flexible alignment approach that reduces reliance on fully scripted content while enhancing LLMs' therapeutic adherence and controllability. In a 10-day field study, SSAG demonstrated performance comparable to full script alignment and outperformed rule-based chatbots, empirically supporting SSAG as an efficient approach for aligning LLMs with domain expertise. Our work advances LLM applications in psychotherapy by providing a controllable, adaptable, and scalable solution for digital interventions, reducing reliance on expert effort. It also provides a collaborative framework for domain experts and developers to efficiently build expertise-aligned chatbots, broadening access to psychotherapy and behavioral interventions.
Submitted: 11 November, 2024; originally announced November 2024.
arXiv:2411.06685 (https://arxiv.org/abs/2411.06685) [pdf, other]
Subjects: cs.CV; cs.AI; eess.IV
Title: High-Frequency Enhanced Hybrid Neural Representation for Video Compression
Authors: Li Yu, Zhihui Li, Jimin Xiao, Moncef Gabbouj
Abstract: Neural Representations for Videos (NeRV) have simplified the video codec process and achieved swift decoding speeds by encoding video content into a neural network, presenting a promising solution for video compression. However, existing work overlooks the crucial issue that videos reconstructed by these methods lack high-frequency details. To address this problem, this paper introduces a High-Frequency Enhanced Hybrid Neural Representation Network. Our method focuses on leveraging high-frequency information to improve the synthesis of fine details by the network. Specifically, we design a wavelet high-frequency encoder that incorporates Wavelet Frequency Decomposer (WFD) blocks to generate high-frequency feature embeddings. Next, we design the High-Frequency Feature Modulation (HFM) block, which leverages the extracted high-frequency embeddings to enhance the fitting process of the decoder. Finally, with the refined Harmonic decoder block and a Dynamic Weighted Frequency Loss, we further reduce the potential loss of high-frequency information. Experiments on the Bunny and UVG datasets demonstrate that our method outperforms other methods, showing notable improvements in detail preservation and compression performance.
Submitted: 10 November, 2024; originally announced November 2024.

arXiv:2411.06518 (https://arxiv.org/abs/2411.06518) [pdf, other]
Subjects: cs.LG; q-bio.QM; stat.ME
Title: Causal Representation Learning from Multimodal Biological Observations
Authors: Yuewen Sun, Lingjing Kong, Guangyi Chen, Loka Li, Gongxu Luo, Zijian Li, Yixuan Zhang, Yujia Zheng, Mengyue Yang, Petar Stojanov, Eran Segal, Eric P. Xing, Kun Zhang
Abstract: Prevalent in biological applications (e.g., human phenotype measurements), multimodal datasets can provide valuable insights into the underlying biological mechanisms. However, current machine learning models designed to analyze such datasets still lack interpretability and theoretical guarantees, which are essential to biological applications.
Recent advances in causal representation learning have shown promise in uncovering the interpretable latent causal variables with formal theoretical certificates. Unfortunately, existing works for multimodal distributions either rely on restrictive parametric assumptions or provide rather coarse identification results, limiting their applicability to biological research, which favors a detailed understanding of the mechanisms. In this work, we aim to develop flexible identification conditions for multimodal data and principled methods to facilitate the understanding of biological datasets. Theoretically, we consider a flexible nonparametric latent distribution (cf. parametric assumptions in prior work) permitting causal relationships across potentially different modalities. We establish identifiability guarantees for each latent component, extending the subspace identification results from prior work. Our key theoretical ingredient is the structural sparsity of the causal connections among distinct modalities, which, as we will discuss, is natural for a large collection of biological systems. Empirically, we propose a practical framework to instantiate our theoretical insights. We demonstrate the effectiveness of our approach through extensive experiments on both numerical and synthetic datasets. Results on a real-world human phenotype dataset are consistent with established medical research, validating our theoretical and methodological framework.
Submitted: 10 November, 2024; originally announced November 2024.
arXiv:2411.06363 (https://arxiv.org/abs/2411.06363) [pdf, other]
Subjects: cs.CV; cs.AI
Title: Layer-Wise Feature Metric of Semantic-Pixel Matching for Few-Shot Learning
Authors: Hao Tang, Junhao Lu, Guoheng Huang, Ming Li, Xuhang Chen, Guo Zhong, Zhengguang Tan, Zinuo Li
Abstract: In Few-Shot Learning (FSL), traditional metric-based approaches often rely on global metrics to compute similarity. However, in natural scenes, the spatial arrangement of key instances is often inconsistent across images. This spatial misalignment can result in mismatched semantic pixels, leading to inaccurate similarity measurements. To address this issue, we propose a novel method called the Layer-Wise Features Metric of Semantic-Pixel Matching (LWFM-SPM) to make finer comparisons. Our method enhances model performance through two key modules: (1) the Layer-Wise Embedding (LWE) Module, which refines the cross-correlation of image pairs to generate well-focused feature maps for each layer; (2) the Semantic-Pixel Matching (SPM) Module, which aligns critical pixels based on semantic embeddings using an assignment algorithm. We conducted extensive experiments to evaluate our method on four widely used few-shot classification benchmarks: miniImageNet, tieredImageNet, CUB-200-2011, and CIFAR-FS. The results indicate that LWFM-SPM achieves competitive performance across these benchmarks. Our code will be publicly available on https://github.com/Halo2Tang/Code-for-LWFM-SPM.
Submitted: 10 November, 2024; originally announced November 2024.
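The SPM module aligns pixels with an assignment algorithm over semantic embeddings. A generic sketch of that step using the Hungarian algorithm from SciPy is given below; the cosine-similarity cost, feature shapes, and function name are illustrative assumptions rather than the paper's implementation.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_semantic_pixels(feat_a, feat_b):
    """feat_a, feat_b: (num_pixels, dim) embeddings from two images.
    Returns index pairs that maximize total cosine similarity."""
    a = feat_a / np.linalg.norm(feat_a, axis=1, keepdims=True)
    b = feat_b / np.linalg.norm(feat_b, axis=1, keepdims=True)
    sim = a @ b.T                               # cosine similarity matrix
    rows, cols = linear_sum_assignment(-sim)    # negate to maximize similarity
    return rows, cols, float(sim[rows, cols].mean())

fa = np.random.rand(49, 64)   # e.g. a 7x7 feature map flattened per image
fb = np.random.rand(49, 64)
rows, cols, score = match_semantic_pixels(fa, fb)
print(score)
```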
arXiv:2411.06128 (https://arxiv.org/abs/2411.06128) [pdf]
Subjects: cs.RO; cs.AI
Title: Research on reinforcement learning based warehouse robot navigation algorithm in complex warehouse layout
Authors: Keqin Li, Lipeng Liu, Jiajing Chen, Dezhi Yu, Xiaofan Zhou, Ming Li, Congyu Wang, Zhao Li
Abstract: In this paper, how to efficiently find the optimal path in a complex warehouse layout and make real-time decisions is a key problem. This paper proposes a new method combining Proximal Policy Optimization (PPO) and Dijkstra's algorithm, Proximal Policy-Dijkstra (PP-D). The PP-D method realizes efficient strategy learning and real-time decision making through PPO, and uses Dijkstra's algorithm to plan the globally optimal path, thus ensuring high navigation accuracy and significantly improving the efficiency of path planning. Specifically, PPO enables robots to quickly adapt and optimize action strategies in dynamic environments through its stable policy updating mechanism. Dijkstra's algorithm ensures globally optimal path planning in static environments. Finally, through comparison experiments and analysis of the proposed framework against traditional algorithms, the results show that the PP-D method has significant advantages in improving the accuracy of navigation prediction and enhancing the robustness of the system. Especially in complex warehouse layouts, the PP-D method can find the optimal path more accurately and reduce collisions and stagnation. This demonstrates the reliability and effectiveness of the method for robot navigation in complex warehouse layouts.
Submitted: 9 November, 2024; originally announced November 2024.
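The static global-planning half of PP-D is standard Dijkstra search. The sketch below runs Dijkstra on a small 4-connected occupancy grid; the grid, start, and goal are toy values, and the PPO component for dynamic adaptation is omitted entirely.

```python
import heapq

def dijkstra_grid(grid, start, goal):
    """Shortest path on a 4-connected grid; cells equal to 1 are obstacles.
    Returns the list of cells from start to goal, or None if unreachable."""
    rows, cols = len(grid), len(grid[0])
    dist, prev = {start: 0}, {}
    heap = [(0, start)]
    while heap:
        d, cell = heapq.heappop(heap)
        if cell == goal:
            path = [cell]
            while cell in prev:
                cell = prev[cell]
                path.append(cell)
            return path[::-1]
        if d > dist.get(cell, float("inf")):
            continue                       # stale heap entry
        r, c = cell
        for nr, nc in ((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)):
            if 0 <= nr < rows and 0 <= nc < cols and grid[nr][nc] == 0:
                nd = d + 1
                if nd < dist.get((nr, nc), float("inf")):
                    dist[(nr, nc)] = nd
                    prev[(nr, nc)] = cell
                    heapq.heappush(heap, (nd, (nr, nc)))
    return None

warehouse = [[0, 0, 0, 1],
             [1, 1, 0, 1],
             [0, 0, 0, 0]]
print(dijkstra_grid(warehouse, (0, 0), (2, 3)))
```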
arXiv:2411.06046 (https://arxiv.org/abs/2411.06046) [pdf]
Subjects: cs.AI
Title: Personalized News Recommendation System via LLM Embedding and Co-Occurrence Patterns
Authors: Zheng Li, Kai Zhange
Abstract: In the past two years, large language models (LLMs) have achieved rapid development and demonstrated remarkable emerging capabilities. Concurrently, with powerful semantic understanding and reasoning capabilities, LLMs have significantly empowered the rapid advancement of the recommendation system field. Specifically, in news recommendation (NR), systems must comprehend and process a vast amount of clicked news text to infer the probability of candidate news clicks. This requirement exceeds the capabilities of traditional NR models but aligns well with the strengths of LLMs. In this paper, we propose a novel NR algorithm to reshape the news model via LLM Embedding and Co-Occurrence Pattern (LECOP). On one hand, we fine-tuned an LLM by contrastive learning using large-scale datasets to encode news, which can fully explore the semantic information of news to thoroughly identify user preferences. On the other hand, we explored multiple co-occurrence patterns to mine collaborative information. Those patterns include news ID co-occurrence, item-item keyword co-occurrence and intra-item keyword co-occurrence. The keywords mentioned above are all generated by the LLM. To the best of our knowledge, this is the first work to construct such detailed co-occurrence patterns via an LLM to capture collaboration. Extensive experiments demonstrate the superior performance of our proposed novel method.
Submitted: 8 November, 2024; originally announced November 2024.
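Of the patterns listed, news ID co-occurrence is the simplest to picture: count how often two items are clicked by the same user. A minimal counting sketch follows; the data layout and function name are illustrative and unrelated to the paper's code.

```python
from collections import Counter
from itertools import combinations

def news_id_cooccurrence(click_histories):
    """Count how often two news IDs appear in the same user's click history.
    click_histories: iterable of per-user lists of clicked news IDs."""
    counts = Counter()
    for history in click_histories:
        for a, b in combinations(sorted(set(history)), 2):
            counts[(a, b)] += 1
    return counts

histories = [["n1", "n2", "n3"], ["n2", "n3"], ["n1", "n3"]]
cooc = news_id_cooccurrence(histories)
print(cooc.most_common(3))   # most frequently co-clicked pairs first
```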
arXiv:2411.05881 (https://arxiv.org/abs/2411.05881) [pdf, other]
Subjects: cs.RO
Title: MIPD: A Multi-sensory Interactive Perception Dataset for Embodied Intelligent Driving
Authors: Zhiwei Li, Tingzhen Zhang, Meihua Zhou, Dandan Tang, Pengwei Zhang, Wenzhuo Liu, Qiaoning Yang, Tianyu Shen, Kunfeng Wang, Huaping Liu
Abstract: During the process of driving, humans usually rely on multiple senses to gather information and make decisions. Analogously, in order to achieve embodied intelligence in autonomous driving, it is essential to integrate multidimensional sensory information in order to facilitate interaction with the environment. However, the current multi-modal fusion sensing schemes often neglect these additional sensory inputs, hindering the realization of fully autonomous driving. This paper considers multi-sensory information and proposes a multi-modal interactive perception dataset named MIPD, enabling the expansion of current autonomous driving algorithm frameworks and supporting research on embodied intelligent driving. In addition to the conventional camera, lidar, and 4D radar data, our dataset incorporates multiple sensor inputs including sound, light intensity, vibration intensity and vehicle speed to enrich the dataset comprehensiveness. Comprising 126 consecutive sequences, many exceeding twenty seconds, MIPD features over 8,500 meticulously synchronized and annotated frames. Moreover, it encompasses many challenging scenarios, covering various road and lighting conditions. The dataset has undergone thorough experimental validation, producing valuable insights for the exploration of next-generation autonomous driving frameworks.
Submitted: 8 November, 2024; originally announced November 2024.
Comments: Data, development kit and more details will be available at https://github.com/BUCT-IUSRC/Dataset MIPD

arXiv:2411.05844 (https://arxiv.org/abs/2411.05844) [pdf, other]
Subjects: cs.AI; cs.CL
Title: LEGO-GraphRAG: Modularizing Graph-based Retrieval-Augmented Generation for Design Space Exploration
Authors: Yukun Cao, Zengyi Gao, Zhiyang Li, Xike Xie, S Kevin Zhou
Abstract: GraphRAG addresses significant challenges in Retrieval-Augmented Generation (RAG) by leveraging graphs with embedded knowledge to enhance the reasoning capabilities of Large Language Models (LLMs). Despite its promising potential, the GraphRAG community currently lacks a unified framework for fine-grained decomposition of the graph-based knowledge retrieval process. Furthermore, there is no systematic categorization or evaluation of existing solutions within the retrieval process. In this paper, we present LEGO-GraphRAG, a modular framework that decomposes the retrieval process of GraphRAG into three interconnected modules: subgraph-extraction, path-filtering, and path-refinement. We systematically summarize and classify the algorithms and neural network (NN) models relevant to each module, providing a clearer understanding of the design space for GraphRAG instances. Additionally, we identify key design factors, such as Graph Coupling and Computational Cost, that influence the effectiveness of GraphRAG implementations.
Through extensive empirical studies, we construct high-quality GraphRAG instances using a representative selection of solutions and analyze their impact on retrieval and reasoning performance. Our findings offer critical insights into optimizing GraphRAG instance design, ultimately contributing to the advancement of more accurate and contextually relevant LLM applications.
Submitted 6 November, 2024; originally announced November 2024.
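As a hedged illustration of the modular decomposition described above: the three module names come from the abstract, while the function signatures, data structures, and toy logic below are assumptions, not LEGO-GraphRAG's actual interfaces.

```python
# Minimal sketch of a three-stage GraphRAG-style retrieval pipeline:
# subgraph-extraction -> path-filtering -> path-refinement.
from typing import Dict, List, Tuple

Graph = Dict[str, List[str]]          # adjacency list: node -> neighbors
Path = Tuple[str, ...]

def subgraph_extraction(graph: Graph, seeds: List[str], hops: int = 2) -> Graph:
    """Keep only nodes reachable from the query seeds within `hops` steps."""
    frontier, kept = set(seeds), set(seeds)
    for _ in range(hops):
        frontier = {n for u in frontier for n in graph.get(u, [])} - kept
        kept |= frontier
    return {u: [v for v in vs if v in kept] for u, vs in graph.items() if u in kept}

def path_filtering(sub: Graph, seeds: List[str], max_len: int = 3) -> List[Path]:
    """Enumerate short paths starting from the seeds (coarse candidate set)."""
    paths: List[Path] = []
    def walk(path: Path) -> None:
        paths.append(path)
        if len(path) < max_len:
            for nxt in sub.get(path[-1], []):
                if nxt not in path:
                    walk(path + (nxt,))
    for s in seeds:
        walk((s,))
    return paths

def path_refinement(paths: List[Path], query: str) -> List[Path]:
    """Re-rank candidate paths; a real instance might plug in an NN scorer."""
    def score(p: Path) -> int:
        joined = " ".join(p).lower()
        return sum(tok in joined for tok in query.lower().split())
    return sorted(paths, key=score, reverse=True)[:5]

# Example usage on a toy knowledge graph:
g = {"li": ["paper_a", "paper_b"], "paper_a": ["graphrag"], "paper_b": ["rag"]}
print(path_refinement(path_filtering(subgraph_extraction(g, ["li"]), ["li"]), "graphrag"))
```

The point of such a decomposition is that each stage can be swapped independently (rule-based vs. neural), which is exactly the design space the paper sets out to explore.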

arXiv:2411.05825 (https://arxiv.org/abs/2411.05825) [pdf, other]
Subjects: q-bio.NC (Neurons and Cognition); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: SurfGNN: A robust surface-based prediction model with interpretability for coactivation maps of spatial and cortical features
Authors: Zhuoshuo Li, Jiong Zhang, Youbing Zeng, Jiaying Lin, Dan Zhang, Jianjia Zhang, Duan Xu, Hosung Kim, Bingguang Liu, Mengting Liu
Abstract: Current brain surface-based prediction models often overlook the variability of regional attributes at the cortical feature level. While graph neural networks (GNNs) excel at capturing regional differences, they encounter challenges when dealing with complex, high-density graph structures. In this work, we consider the cortical surface mesh as a sparse graph and propose an interpretable prediction model, the Surface Graph Neural Network (SurfGNN). SurfGNN employs topology-sampling learning (TSL) and region-specific learning (RSL) structures to manage individual cortical features at both lower and higher scales of the surface mesh, effectively tackling the challenges posed by the overly abundant mesh nodes and addressing the issue of heterogeneity in cortical regions. Building on this, a novel score-weighted fusion (SWF) method is implemented to merge nodal representations associated with each cortical feature for prediction. We apply our model to a neonatal brain age prediction task using a dataset of harmonized MR images from 481 subjects (503 scans). SurfGNN outperforms all existing state-of-the-art methods, demonstrating an improvement of at least 9.0% and achieving a mean absolute error (MAE) of 0.827±0.056 in postmenstrual weeks. Furthermore, it generates feature-level activation maps, indicating its capability to identify robust regional variations in different morphometric contributions for prediction.
Submitted 5 November, 2024; originally announced November 2024.
Comments: 15 pages, 6 figures
ACM Class: J.3
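As a hedged aside, one plausible reading of a score-weighted fusion step such as the SWF described above is a learned softmax gate over per-feature embeddings; the shapes and layers below are illustrative assumptions, not SurfGNN's actual architecture.

```python
# Toy score-weighted fusion: merge one pooled embedding per cortical feature
# using learned scalar scores. Shapes and layers are illustrative assumptions.
import torch
import torch.nn as nn

class ScoreWeightedFusion(nn.Module):
    """Merge one embedding per cortical feature via learned scores."""
    def __init__(self, dim: int):
        super().__init__()
        self.scorer = nn.Linear(dim, 1)   # scalar score per feature embedding
        self.head = nn.Linear(dim, 1)     # regression head (e.g., brain age)

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        # feats: (batch, num_features, dim), one row per cortical feature
        # (e.g., thickness, curvature, sulcal depth, ...).
        weights = torch.softmax(self.scorer(feats), dim=1)   # (batch, F, 1)
        fused = (weights * feats).sum(dim=1)                  # (batch, dim)
        return self.head(fused).squeeze(-1)                   # (batch,)

# Example: 4 cortical features, 32-dim embeddings, batch of 2 subjects.
swf = ScoreWeightedFusion(dim=32)
print(swf(torch.randn(2, 4, 32)).shape)   # torch.Size([2])
```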
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05348">arXiv:2411.05348</a> <span> [<a href="https://arxiv.org/pdf/2411.05348">pdf</a>, <a href="https://arxiv.org/format/2411.05348">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM-PySC2: Starcraft II learning environment for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongyuan Li</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+Y">Yanan Ni</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+R">Runnan Qi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lumin Jiang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chang Lu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaojie Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangbei Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengfei Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yunzheng Guo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhe Ma</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xian Guo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kuihua Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuebo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05348v1-abstract-short" style="display: inline;"> This paper introduces a new environment LLM-PySC2 (the Large Language Model StarCraft II Learning Environment), a platform derived from DeepMind's StarCraft II Learning Environment that serves to develop Large Language Models (LLMs) based decision-making methodologies. This environment is the first to offer the complete StarCraft II action space, multi-modal observation interfaces, and a structure… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05348v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05348v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05348v1-abstract-full" style="display: none;"> This paper introduces a new environment LLM-PySC2 (the Large Language Model StarCraft II Learning Environment), a platform derived from DeepMind's StarCraft II Learning Environment that serves to develop Large Language Models (LLMs) based decision-making methodologies. This environment is the first to offer the complete StarCraft II action space, multi-modal observation interfaces, and a structured game knowledge database, which are seamlessly connected with various LLMs to facilitate the research of LLMs-based decision-making. To further support multi-agent research, we developed an LLM collaborative framework that supports multi-agent concurrent queries and multi-agent communication. 
In our experiments, the LLM-PySC2 environment is adapted to be compatible with the StarCraft Multi-Agent Challenge (SMAC) task group and provides eight new scenarios focused on macro-decision abilities. We evaluated nine mainstream LLMs in the experiments, and the results show that sufficient parameters are necessary for LLMs to make decisions, but improving reasoning ability does not directly lead to better decision-making outcomes. Our findings further indicate the importance of enabling large models to learn autonomously in the deployment environment through parameter training or train-free learning techniques. Ultimately, we expect that the LLM-PySC2 environment can promote research on learning methods for LLMs, helping LLM-based methods better adapt to task scenarios.
Submitted 8 November, 2024; originally announced November 2024.
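As a hedged sketch of the kind of observation-to-action loop such an environment supports; the class and method names below are placeholders, not the actual LLM-PySC2 API.

```python
# Generic observation -> prompt -> LLM -> action loop, with stub components.
# Names are placeholders; this is NOT the LLM-PySC2 interface.
class StubEnv:
    def reset(self): return "minerals=50; idle_workers=1; enemy_seen=False"
    def step(self, action): return f"did:{action}", (action == "train_worker"), False

class StubLLM:
    def generate(self, prompt: str) -> str:
        # A real agent would call a hosted or local LLM here.
        return "train_worker" if "idle_workers=1" in prompt else "scout"

def run_episode(env, llm, max_steps: int = 3) -> float:
    obs, total = env.reset(), 0.0
    for _ in range(max_steps):
        prompt = f"Observation: {obs}\nChoose one action (train_worker/scout):"
        action = llm.generate(prompt).strip()
        obs, reward, done = env.step(action)
        total += reward
        if done:
            break
    return total

print(run_episode(StubEnv(), StubLLM()))
```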

arXiv:2411.05261 (https://arxiv.org/abs/2411.05261) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: Decoding Report Generators: A Cyclic Vision-Language Adapter for Counterfactual Explanations
Authors: Yingying Fang, Zihao Jin, Shaojie Guo, Jinda Liu, Yijian Gao, Junzhi Ning, Zhiling Yue, Zhi Li, Simon LF Walsh, Guang Yang
Abstract: Despite significant advancements in report generation methods, a critical limitation remains: the lack of interpretability in the generated text. This paper introduces an innovative approach to enhance the explainability of text generated by report generation models. Our method employs cyclic text manipulation and visual comparison to identify and elucidate the features in the original content that influence the generated text. By manipulating the generated reports and producing corresponding images, we create a comparative framework that highlights key attributes and their impact on the text generation process. This approach not only identifies the image features aligned to the generated text but also improves transparency and provides deeper insights into the decision-making mechanisms of the report generation models. Our findings demonstrate the potential of this method to significantly enhance the interpretability and transparency of AI-generated reports.
Submitted 7 November, 2024; originally announced November 2024.

arXiv:2411.05185 (https://arxiv.org/abs/2411.05185) [pdf, other]
Subjects: cs.CR (Cryptography and Security)
Title: PentestAgent: Incorporating LLM Agents to Automated Penetration Testing
Authors: Xiangmin Shen, Lingzhi Wang, Zhenyuan Li, Yan Chen, Wencheng Zhao, Dawei Sun, Jiashui Wang, Wei Ruan
Abstract: Penetration testing is a critical technique for identifying security vulnerabilities, traditionally performed manually by skilled security specialists. This complex process involves gathering information about the target system, identifying entry points, exploiting the system, and reporting findings. Despite its effectiveness, manual penetration testing is time-consuming and expensive, often requiring significant expertise and resources that many organizations cannot afford. While automated penetration testing methods have been proposed, they often fall short in real-world applications due to limitations in flexibility, adaptability, and implementation. Recent advancements in large language models (LLMs) offer new opportunities for enhancing penetration testing through increased intelligence and automation. However, current LLM-based approaches still face significant challenges, including limited penetration testing knowledge and a lack of comprehensive automation capabilities. To address these gaps, we propose PentestAgent, a novel LLM-based automated penetration testing framework that leverages the power of LLMs and various LLM-based techniques like Retrieval Augmented Generation (RAG) to enhance penetration testing knowledge and automate various tasks. Our framework leverages multi-agent collaboration to automate intelligence gathering, vulnerability analysis, and exploitation stages, reducing manual intervention. We evaluate PentestAgent using a comprehensive benchmark, demonstrating superior performance in task completion and overall efficiency. This work significantly advances the practical applicability of automated penetration testing systems.
Submitted 7 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04898">arXiv:2411.04898</a> <span> [<a href="https://arxiv.org/pdf/2411.04898">pdf</a>, <a href="https://arxiv.org/format/2411.04898">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Strongly Correlated Electrons">cond-mat.str-el</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Physics">math-ph</span> </div> </div> <p class="title is-5 mathjax"> Convergence efficiency of quantum gates and circuits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+L">Linghang Kong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zimu Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zi-Wen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04898v1-abstract-short" style="display: inline;"> We consider quantum circuit models where the gates are drawn from arbitrary gate ensembles given by probabilistic distributions over certain gate sets and circuit architectures, which we call stochastic quantum circuits. Of main interest in this work is the speed of convergence of stochastic circuits with different gate ensembles and circuit architectures to unitary t-designs. A key motivation for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04898v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04898v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04898v1-abstract-full" style="display: none;"> We consider quantum circuit models where the gates are drawn from arbitrary gate ensembles given by probabilistic distributions over certain gate sets and circuit architectures, which we call stochastic quantum circuits. Of main interest in this work is the speed of convergence of stochastic circuits with different gate ensembles and circuit architectures to unitary t-designs. A key motivation for this theory is the varying preference for different gates and circuit architectures in different practical scenarios. In particular, it provides a versatile framework for devising efficient circuits for implementing $t$-designs and relevant applications including random circuit and scrambling experiments, as well as benchmarking the performance of gates and circuit architectures. We examine various important settings in depth. A key aspect of our study is an "ironed gadget" model, which allows us to systematically evaluate and compare the convergence efficiency of entangling gates and circuit architectures. 
Particularly notable results include i) gadgets of two-qubit gates with KAK coefficients $\left(\frac{\pi}{4}-\frac{1}{8}\arccos(\frac{1}{5}),\frac{\pi}{8},\frac{1}{8}\arccos(\frac{1}{5})\right)$ (which we call $\chi$ gates) directly form exact 2- and 3-designs; ii) the iSWAP gate family achieves the best efficiency for convergence to 2-designs under mild conjectures with numerical evidence, even outperforming the Haar-random gate, for generic many-body circuits; iii) iSWAP + complete graph achieve the best efficiency for convergence to 2-designs among all graph circuits. A variety of numerical results are provided to complement our analysis. We also derive robustness guarantees for our analysis against gate perturbations. Additionally, we provide cursory analysis on gates with higher locality and found that the Margolus gate outperforms various other well-known gates.
Submitted 7 November, 2024; originally announced November 2024.
Comments: 50 pages + 8 tables + 6 figures
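An explanatory note for context (an editorial addition, not part of the abstract): the "KAK coefficients" of a two-qubit gate are the triple $(c_x, c_y, c_z)$ in its Cartan (KAK) decomposition, which, up to sign and ordering conventions, reads

$$ U = (k_1 \otimes k_2)\,\exp\!\left[\, i\left(c_x\, X\otimes X + c_y\, Y\otimes Y + c_z\, Z\otimes Z\right)\right]\,(k_3 \otimes k_4), $$

where $k_1,\dots,k_4$ are single-qubit unitaries and $X, Y, Z$ are Pauli matrices. The $\chi$ gate above is therefore specified, up to local unitaries, by the particular coefficient triple quoted in the abstract.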

arXiv:2411.04893 (https://arxiv.org/abs/2411.04893) [pdf, ps, other]
Subjects: quant-ph (Quantum Physics); cond-mat.stat-mech (Statistical Mechanics); cs.IT (Information Theory); math-ph (Mathematical Physics)
Title: Efficient quantum pseudorandomness under conservation laws
Authors: Zimu Li, Han Zheng, Zi-Wen Liu
Abstract: The efficiency of locally generating unitary designs, which capture statistical notions of quantum pseudorandomness, lies at the heart of wide-ranging areas in physics and quantum information technologies. While there are extensive potent methods and results for this problem, the evidently important setting where continuous symmetries or conservation laws (most notably U(1) and SU(d)) are involved is known to present fundamental difficulties. In particular, even the basic question of whether any local symmetric circuit can generate 2-designs efficiently (in time that grows at most polynomially in the system size) remains open with no circuit constructions provably known to do so, despite intensive efforts. In this work, we resolve this long-standing open problem for both U(1) and SU(d) symmetries by explicitly constructing local symmetric quantum circuits which we prove to converge to symmetric unitary 2-designs in polynomial time using a combination of representation theory, graph theory, and Markov chain methods. As a direct application, our constructions can be used to efficiently generate near-optimal random covariant quantum error-correcting codes, confirming a conjecture in [PRX Quantum 3, 020314 (2022)].
Submitted 7 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 + 48 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04794">arXiv:2411.04794</a> <span> [<a href="https://arxiv.org/pdf/2411.04794">pdf</a>, <a href="https://arxiv.org/format/2411.04794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AlignXIE: Improving Multilingual Information Extraction by Cross-Lingual Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zuo%2C+Y">Yuxin Zuo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenxuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenxuan Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zixuan Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Long Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanbin Wang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yutao Zeng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaolong Jin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiafeng Guo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04794v1-abstract-short" style="display: inline;"> Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual alignment. Our findings suggest that although LLMs also demonstrate promising cross-lingual alignment in Information Extraction, there remains significant imbalance across languages, revealing an underlying deficiency in the IE alignment. To address this issue, we propose AlignXIE, a powerful code-based LLM that significantly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04794v1-abstract-full" style="display: none;"> Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual alignment. Our findings suggest that although LLMs also demonstrate promising cross-lingual alignment in Information Extraction, there remains significant imbalance across languages, revealing an underlying deficiency in the IE alignment. To address this issue, we propose AlignXIE, a powerful code-based LLM that significantly enhances cross-lingual IE alignment through two strategies. Firstly, AlignXIE formulates IE across different languages, especially non-English ones, as code generation tasks, standardizing the representation of various schemas using Python classes to ensure consistency of the same ontology in different languages and align the schema. 
Secondly, it incorporates an IE cross-lingual alignment phase through a translated instance prediction task proposed in this paper to align the extraction process, utilizing ParallelNER, an IE bilingual parallel dataset with 257,190 samples, generated by our proposed LLM-based automatic pipeline for IE parallel data construction, with manual annotation to ensure quality. Ultimately, we obtain AlignXIE through multilingual IE instruction tuning. Although it has not been trained on 9 unseen languages, AlignXIE surpasses ChatGPT by $30.17\%$ and SoTA by $20.03\%$, thereby demonstrating superior cross-lingual IE capabilities. Comprehensive evaluations on 63 IE benchmarks in Chinese and English under various settings demonstrate that AlignXIE significantly enhances cross-lingual and multilingual IE by boosting the IE alignment.
Submitted 7 November, 2024; originally announced November 2024.
Comments: Work in progress
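Since the abstract describes standardizing schemas as Python classes for code-style generation, here is a minimal sketch of that formulation; the class and field names are illustrative assumptions, not AlignXIE's actual schema definitions.

```python
# Illustrative schema-as-code formulation for cross-lingual IE.
# Class/field names are assumptions; only the general idea (one Python class
# per relation type, shared across languages) comes from the abstract.
from dataclasses import dataclass
from typing import List

@dataclass
class WorksFor:
    """Relation schema shared by all languages: (person, organization)."""
    person: str
    organization: str

def build_prompt(sentence: str) -> str:
    # The extractor LLM is asked to emit instances of the schema class, so
    # outputs for Chinese, English, etc. share one ontology.
    return (
        "class WorksFor: person: str; organization: str\n"
        f"sentence = {sentence!r}\n"
        "results: List[WorksFor] = "
    )

def parse(model_output: str) -> List[WorksFor]:
    # A real system would parse (or execute) the generated code; stubbed here.
    return []

print(build_prompt("Li Z. is a researcher at Example University."))
```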

arXiv:2411.04693 (https://arxiv.org/abs/2411.04693) [pdf]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Reciprocal Point Learning Network with Large Electromagnetic Kernel for SAR Open-Set Recognition
Authors: Xiayang Xiao, Zhuoxuan Li, Ruyi Zhang, Jiacheng Chen, Haipeng Wang
Abstract: The limitations of existing Synthetic Aperture Radar (SAR) Automatic Target Recognition (ATR) methods lie in their confinement by the closed-environment assumption, hindering their effective and robust handling of unknown target categories in open environments. Open Set Recognition (OSR), a pivotal facet for algorithmic practicality, intends to categorize known classes while denoting unknown ones as "unknown." The chief challenge in OSR involves concurrently mitigating risks associated with generalizing features from a restricted set of known classes to numerous unknown samples and the open space exposure to potential unknown data. To enhance open-set SAR classification, a method called scattering kernel with reciprocal learning network is proposed. Initially, a feature learning framework is constructed based on reciprocal point learning (RPL), establishing a bounded space for potential unknown classes. This approach indirectly introduces unknown information into a learner confined to known classes, thereby acquiring more concise and discriminative representations. Subsequently, considering the variability in the imaging of targets at different angles and the discreteness of components in SAR images, a proposal is made to design convolutional kernels based on large-sized attribute scattering center models. This enhances the ability to extract intrinsic non-linear features and specific scattering characteristics in SAR images, thereby improving the discriminative features of the model and mitigating the impact of imaging variations on classification performance. Experiments on the MSTAR datasets substantiate the superior performance of the proposed approach called ASC-RPL over mainstream methods.
Submitted 7 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04494">arXiv:2411.04494</a> <span> [<a href="https://arxiv.org/pdf/2411.04494">pdf</a>, <a href="https://arxiv.org/format/2411.04494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Online Omnidirectional Jumping Trajectory Planning for Quadrupedal Robots on Uneven Terrains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yue%2C+L">Linzhu Yue</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhitao Song</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Jinhu Dong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhongyu Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xuanqi Zeng</a>, <a href="/search/cs?searchtype=author&query=Sreenath%2C+K">Koushil Sreenath</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yun-hui Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04494v2-abstract-short" style="display: inline;"> Natural terrain complexity often necessitates agile movements like jumping in animals to improve traversal efficiency. To enable similar capabilities in quadruped robots, complex real-time jumping maneuvers are required. Current research does not adequately address the problem of online omnidirectional jumping and neglects the robot's kinodynamic constraints during trajectory generation. This pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04494v2-abstract-full').style.display = 'inline'; document.getElementById('2411.04494v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04494v2-abstract-full" style="display: none;"> Natural terrain complexity often necessitates agile movements like jumping in animals to improve traversal efficiency. To enable similar capabilities in quadruped robots, complex real-time jumping maneuvers are required. Current research does not adequately address the problem of online omnidirectional jumping and neglects the robot's kinodynamic constraints during trajectory generation. This paper proposes a general and complete cascade online optimization framework for omnidirectional jumping for quadruped robots. Our solution systematically encompasses jumping trajectory generation, a trajectory tracking controller, and a landing controller. It also incorporates environmental perception to navigate obstacles that standard locomotion cannot bypass, such as jumping from high platforms. We introduce a novel jumping plane to parameterize omnidirectional jumping motion and formulate a tightly coupled optimization problem accounting for the kinodynamic constraints, simultaneously optimizing CoM trajectory, Ground Reaction Forces (GRFs), and joint states. 
To meet the online requirements, we propose an accelerated evolutionary algorithm as the trajectory optimizer to address the complexity of kinodynamic constraints. To ensure stability and accuracy in environmental perception post-landing, we introduce a coarse-to-fine relocalization method that combines global Branch and Bound (BnB) search with Maximum a Posteriori (MAP) estimation for precise positioning during navigation and jumping. The proposed framework achieves jump trajectory generation in approximately 0.1 seconds with a warm start and has been successfully validated on two quadruped robots on uneven terrains. Additionally, we extend the framework's versatility to humanoid robots.
Submitted 9 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
Comments: Submitted to IJRR

arXiv:2411.04335 (https://arxiv.org/abs/2411.04335) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: GazeGen: Gaze-Driven User Interaction for Visual Content Generation
Authors: He-Yen Hsieh, Ziyun Li, Sai Qian Zhang, Wei-Te Mark Ting, Kao-Den Chang, Barbara De Salvo, Chiao Liu, H. T. Kung
Abstract: We present GazeGen, a user interaction system that generates visual content (images and videos) for locations indicated by the user's eye gaze. GazeGen allows intuitive manipulation of visual content by targeting regions of interest with gaze. Using advanced techniques in object detection and generative AI, GazeGen performs gaze-controlled image adding/deleting, repositioning, and surface style changes of image objects, and converts static images into videos. Central to GazeGen is the DFT Gaze (Distilled and Fine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters, performing accurate real-time gaze predictions tailored to individual users' eyes on small edge devices. GazeGen is the first system to combine visual content generation with real-time gaze estimation, made possible exclusively by DFT Gaze. This real-time gaze estimation enables various visual content generation tasks, all controlled by the user's gaze. The input for DFT Gaze is the user's eye images, while the inputs for visual content generation are the user's view and the predicted gaze point from DFT Gaze. To achieve efficient gaze predictions, we derive the small model from a large model (10x larger) via novel knowledge distillation and personal adaptation techniques. We integrate knowledge distillation with a masked autoencoder, developing a compact yet powerful gaze estimation model. This model is further fine-tuned with Adapters, enabling highly accurate and personalized gaze predictions with minimal user input. DFT Gaze ensures low-latency and precise gaze tracking, supporting a wide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA and OpenEDS2020 benchmarks, demonstrating low angular gaze error and low latency on the edge device (Raspberry Pi 4). Furthermore, we describe applications of GazeGen, illustrating its versatility and effectiveness in various usage scenarios.
Submitted 17 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>