Search | arXiv e-print repository
Showing 1–50 of 765 results for author: Gao, Z
Searching in archive cs. Results are sorted by announcement date (newest first), 50 results per page.
1. arXiv:2502.13838 [pdf, other] (eess.SP, cs.CV, cs.IT, eess.IV)
   Generative Video Semantic Communication via Multimodal Semantic Fusion with Large Model
   Authors: Hang Yin, Li Qiao, Yu Ma, Shuo Sun, Kan Li, Zhen Gao, Dusit Niyato
   Abstract: Despite significant advancements in traditional syntactic communications based on Shannon's theory, these methods struggle to meet the requirements of 6G immersive communications, especially under challenging transmission conditions. With the development of generative artificial intelligence (GenAI), progress has been made in reconstructing videos using high-level semantic information. In this paper, we propose a scalable generative video semantic communication framework that extracts and transmits semantic information to achieve high-quality video reconstruction. Specifically, at the transmitter, a description and other condition signals (e.g., the first frame, sketches) are extracted from the source video, functioning as text and structural semantics, respectively. At the receiver, diffusion-based GenAI large models are utilized to fuse the semantics of the multiple modalities for reconstructing the video. Simulation results demonstrate that, at an ultra-low channel bandwidth ratio (CBR), our scheme effectively captures semantic information to reconstruct videos aligned with human perception under different signal-to-noise ratios. Notably, the proposed "First Frame+Desc." scheme consistently achieves a CLIP score exceeding 0.92 at CBR = 0.0057 for SNR > 0 dB, demonstrating robust performance even under low-SNR conditions.
   Submitted 19 February, 2025; originally announced February 2025.
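   As a concrete reading of the CBR figure above, a minimal sketch in Python, assuming the common convention that CBR is the number of channel uses divided by the number of source symbols; the paper's exact bookkeeping may differ, and the numbers below are illustrative, not taken from the paper:

      def channel_bandwidth_ratio(channel_uses: int, frames: int,
                                  height: int, width: int, channels: int = 3) -> float:
          """CBR = channel uses per source symbol of the transmitted video."""
          source_symbols = frames * height * width * channels
          return channel_uses / source_symbols

      # Sending a short description plus one coded frame instead of the whole
      # clip is what pushes CBR into the ~0.005 regime reported above.
      print(f"CBR = {channel_bandwidth_ratio(42_000, frames=16, height=448, width=448):.4f}")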
2. arXiv:2502.12096 [pdf, other] (cs.IT, cs.CV, cs.MM, eess.SP)
   Token Communications: A Unified Framework for Cross-modal Context-aware Semantic Communications
   Authors: Li Qiao, Mahdi Boloursaz Mashhadi, Zhen Gao, Rahim Tafazolli, Mehdi Bennis, Dusit Niyato
   Abstract: In this paper, we introduce token communications (TokCom), a unified framework for leveraging cross-modal context information in generative semantic communications (GenSC). TokCom is a new paradigm, motivated by the recent success of generative foundation models and multimodal large language models (GFM/MLLMs), in which the communication units are tokens, enabling efficient transformer-based token processing at the transmitter and receiver. We introduce the potential opportunities and challenges of leveraging context in GenSC, explore how to integrate GFM/MLLM-based token processing into semantic communication systems to exploit cross-modal context effectively, and present the key principles for efficient TokCom at various layers of future wireless networks. We demonstrate the corresponding TokCom benefits in a GenSC setup for image transmission, where leveraging cross-modal context information increases bandwidth efficiency by 70.8% with negligible loss of semantic/perceptual quality. Finally, potential research directions are identified to facilitate the adoption of TokCom in future wireless networks.
   Submitted 17 February, 2025; originally announced February 2025.
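   To see why token-based units can be bandwidth-efficient, a toy sketch: transmit indices into a shared vocabulary instead of raw bytes. The whitespace tokenizer and index width are placeholder assumptions, not the paper's setup, and the savings printed here are unrelated to the 70.8% figure reported above:

      import math

      message = "a unified framework for cross-modal context-aware semantic communications"
      words = message.split()
      vocab = sorted(set(words))                   # shared tokenizer vocabulary (toy)
      token_ids = [vocab.index(w) for w in words]  # what actually goes over the air

      raw_bits = 8 * len(message.encode("utf-8"))  # baseline: send the raw bytes
      token_bits = max(1, math.ceil(math.log2(len(vocab)))) * len(token_ids)
      print(f"raw: {raw_bits} bits, token indices: {token_bits} bits")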
3. arXiv:2502.07384 [pdf, other] (cs.CE)
   SAGEPhos: Sage Bio-Coupled and Augmented Fusion for Phosphorylation Site Detection
   Authors: Jingjie Zhang, Hanqun Cao, Zijun Gao, Xiaorui Wang, Chunbin Gu
   Abstract: Phosphorylation site prediction based on kinase-substrate interaction plays a vital role in understanding cellular signaling pathways and disease mechanisms. Computational methods for this task can be categorized into kinase-family-focused and individual-kinase-targeted approaches. Individual-kinase-targeted methods have gained prominence for their ability to explore a broader protein space and provide more precise target information for kinase inhibitors. However, most existing individual-kinase-based approaches focus solely on sequence inputs, neglecting crucial structural information. To address this limitation, we introduce SAGEPhos (Structure-aware kinAse-substrate bio-coupled and bio-auGmented nEtwork for Phosphorylation site prediction), a novel framework that modifies the semantic space of the main protein inputs using auxiliary inputs at two distinct modality levels. At the inter-modality level, SAGEPhos introduces a Bio-Coupled Modal Fusion method, distilling essential kinase sequence information to refine the task-oriented local substrate feature space and creating a shared semantic space that captures crucial kinase-substrate interaction patterns. Within the substrate's intra-modality domain, it focuses on Bio-Augmented Fusion, emphasizing 2D local sequence information while selectively incorporating 3D spatial information from predicted structures to complement the sequence space. Moreover, to address the lack of structural information in current datasets, we contribute a new, refined phosphorylation site prediction dataset, which incorporates crucial structural elements and will serve as a new benchmark for the field. Experimental results demonstrate that SAGEPhos significantly outperforms baseline methods. We release the SAGEPhos models and code at https://github.com/ZhangJJ26/SAGEPhos.
   Submitted 11 February, 2025; originally announced February 2025.
   Comments: Extended from ICLR 2025
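   The abstract does not spell out the fusion operator; one plausible realization of "auxiliary kinase information refines substrate features" is cross-attention, sketched here in NumPy with illustrative shapes (an assumption, not the paper's architecture):

      import numpy as np

      def cross_attention(substrate: np.ndarray, kinase: np.ndarray) -> np.ndarray:
          """Substrate residues (queries) attend to kinase features (keys/values)."""
          scores = substrate @ kinase.T / np.sqrt(substrate.shape[-1])
          weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
          weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
          return substrate + weights @ kinase              # residual fusion

      rng = np.random.default_rng(0)
      fused = cross_attention(rng.normal(size=(128, 64)), rng.normal(size=(300, 64)))
      print(fused.shape)  # (128, 64): substrate features refined by kinase context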
4. arXiv:2502.06913 [pdf, other] (q-bio.QM, cs.AI, cs.LG)
   A Simple yet Effective DDG Predictor is An Unsupervised Antibody Optimizer and Explainer
   Authors: Lirong Wu, Yunfan Liu, Haitao Lin, Yufei Huang, Guojiang Zhao, Zhifeng Gao, Stan Z. Li
   Abstract: The proteins that exist today have been optimized over billions of years of natural evolution, during which nature creates random mutations and selects among them. The discovery of functionally promising mutations is challenged by the limited evolutionarily accessible regions, i.e., only a small region of the fitness landscape is beneficial. Numerous priors have been used to constrain protein evolution to regions of the landscape with high-fitness variants, among which the change in binding free energy (DDG) of protein complexes upon mutation is one of the most commonly used. However, the huge mutation space poses two challenges: (1) how to improve the efficiency of DDG prediction for fast mutation screening; and (2) how to explain mutation preferences and efficiently explore accessible evolutionary regions. To address these challenges, we propose a lightweight DDG predictor (Light-DDG), which adopts a structure-aware Transformer as the backbone and enhances it with knowledge distilled from existing powerful but computationally heavy DDG predictors. Additionally, we augmented, annotated, and released a large-scale dataset containing millions of mutation data points for pre-training Light-DDG. We find that this simple yet effective Light-DDG can serve as a good unsupervised antibody optimizer and explainer. For the target antibody, we propose a novel Mutation Explainer to learn mutation preferences, which accounts for the marginal benefit of each mutation per residue. To further explore accessible evolutionary regions, we conduct preference-guided antibody optimization and evaluate antibody candidates quickly using Light-DDG to identify desirable mutations.
   Submitted 13 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
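   The distillation recipe in miniature: a lightweight student regresses the DDG labels while also matching a heavy teacher's predictions. Everything here (linear models, the 0.5 mixing weight, synthetic data) is a placeholder for the paper's Transformer setup:

      import numpy as np

      rng = np.random.default_rng(0)
      X = rng.normal(size=(256, 32))                   # toy mutation features
      y = X @ rng.normal(size=32) + 0.1 * rng.normal(size=256)  # toy DDG labels
      t_w = rng.normal(size=32)

      def teacher(x):                                  # stand-in for a heavy predictor
          return x @ t_w

      w, lr, lam = np.zeros(32), 0.01, 0.5             # lam weights the distillation term
      for _ in range(300):
          pred = X @ w
          grad = X.T @ (pred - y) / len(y)             # supervised regression term
          grad += lam * X.T @ (pred - teacher(X)) / len(y)  # mimic the teacher
          w -= lr * grad
      print(f"student train MSE: {np.mean((X @ w - y) ** 2):.4f}")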
5. arXiv:2502.06239 [pdf, ps, other] (eess.SP, cs.IT)
   Pre-Equalization Aided Grant-Free Massive Access in Massive MIMO System
   Authors: Yueqing Wang, Yikun Mei, Zhen Gao, Ziwei Wan, Boyu Ning, De Mi, Sami Muhaidat
   Abstract: The spatial diversity and multiplexing advantages of massive multi-input multi-output (mMIMO) can significantly improve the capacity of massive non-orthogonal multiple access (NOMA) in machine-type communications. However, state-of-the-art grant-free massive NOMA schemes for mMIMO systems require accurate estimation of random access channels to perform activity detection and the subsequent coherent data demodulation, which suffers from excessive pilot overhead and access latency. To address this, we propose a pre-equalization aided grant-free massive access scheme for mMIMO systems, for which an iterative detection scheme is conceived. Specifically, the base station (BS) first activates one of its antennas (the beacon antenna) to broadcast a beacon signal, which enables the user equipment (UEs) to perform downlink channel estimation and pre-equalize the uplink random access signal with respect to the channels associated with the beacon antenna. During the uplink transmission stage, the BS detects UEs' activity and data using the proposed iterative detection algorithm, which consists of three modules: coarse data detection (DD), data-aided channel estimation (CE), and fine DD. In the proposed algorithm, joint activity detection and DD are first performed based on the signals received by the beacon antenna. Subsequently, the DD is refined by iteratively performing the data-aided CE module and the fine DD module using the signals received by all BS antennas. Our simulation results demonstrate that the proposed scheme outperforms state-of-the-art mMIMO-based grant-free massive NOMA schemes at the same access latency. Simulation codes are provided to reproduce the results in this article: https://github.com/owenwang517/tvt-2025.
   Submitted 14 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
   Comments: Accepted for publication as a Correspondence in the IEEE Transactions on Vehicular Technology
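   The pre-equalization idea in a toy NumPy model: each active UE divides its symbol by the channel it estimated from the beacon antenna's broadcast, so the superimposed uplink signals arrive at that antenna channel-aligned. Power control, pilots, and realistic noise handling are omitted:

      import numpy as np

      rng = np.random.default_rng(1)
      K, active = 8, [1, 4, 6]              # devices in total, indices of active UEs
      h = (rng.normal(size=K) + 1j * rng.normal(size=K)) / np.sqrt(2)  # beacon-antenna channels
      qpsk = np.array([1 + 1j, 1 - 1j, -1 + 1j, -1 - 1j]) / np.sqrt(2)
      s = rng.choice(qpsk, size=K)          # each UE's data symbol

      rx = sum(h[k] * (s[k] / h[k]) for k in active)   # pre-equalized superposition
      print(np.allclose(rx, s[active].sum()))          # True: channels cancel at the beacon antenna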
6. arXiv:2502.06118 [pdf, other] (cs.IT, eess.SP)
   Token-Domain Multiple Access: Exploiting Semantic Orthogonality for Collision Mitigation
   Authors: Li Qiao, Mahdi Boloursaz Mashhadi, Zhen Gao, Deniz Gündüz
   Abstract: Token communications is an emerging generative semantic communication concept that reduces transmission rates by using context and transformer-based token processing, with tokens serving as universal semantic units. In this paper, we propose a semantic multiple access scheme in the token domain, referred to as ToDMA, where a large number of devices share a tokenizer and a modulation codebook for source and channel coding, respectively. Specifically, the source signal is tokenized into sequences, with each token modulated into a codeword. Codewords from multiple devices are transmitted simultaneously, resulting in overlap at the receiver. The receiver detects the transmitted tokens, assigns them to their respective sources, and mitigates token collisions by leveraging context and semantic orthogonality across the devices' messages. Simulations demonstrate that the proposed ToDMA framework outperforms context-unaware orthogonal and non-orthogonal communication methods in image transmission tasks, achieving lower latency and better image quality.
   Submitted 9 February, 2025; originally announced February 2025.
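   A toy model of the reception step: tokens map to codewords from the shared codebook, the codewords superimpose on the channel, and the receiver recovers the active tokens by matched filtering. The context-based collision resolution that gives ToDMA its edge is not modeled here:

      import numpy as np

      rng = np.random.default_rng(2)
      V, L = 64, 128                                   # vocabulary size, codeword length
      codebook = rng.normal(size=(V, L)) / np.sqrt(L)  # shared modulation codebook

      sent = [3, 17, 42]                               # tokens from three devices
      rx = codebook[sent].sum(axis=0) + 0.05 * rng.normal(size=L)

      scores = codebook @ rx                           # correlate with every codeword
      detected = sorted(np.argsort(scores)[-3:].tolist())
      print(detected, "vs sent", sent)                 # [3, 17, 42] with high probability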
7. arXiv:2502.04684 [pdf, other] (cs.LG, cs.AI)
   G2PDiffusion: Genotype-to-Phenotype Prediction with Diffusion Models
   Authors: Mengdi Liu, Zhangyang Gao, Hong Chang, Stan Z. Li, Shiguang Shan, Xilin Chen
   Abstract: Discovering the genotype-phenotype relationship is crucial for genetic engineering and will facilitate advances in fields such as crop breeding, conservation biology, and personalized medicine. Current research usually focuses on single species and small datasets due to limitations in phenotypic data collection, especially for traits that require visual assessments or physical measurements. Deciphering complex and composite phenotypes, such as morphology, from genetic data at scale remains an open question. To move beyond traditional generic models that rely on simplified assumptions, this paper introduces G2PDiffusion, a first-of-its-kind diffusion model designed for genotype-to-phenotype generation across multiple species. Specifically, we use images to represent morphological phenotypes across species and redefine phenotype prediction as conditional image generation. To this end, the paper introduces an environment-enhanced DNA sequence conditioner and trains a stable diffusion model with a novel alignment method to improve genotype-to-phenotype consistency. Extensive experiments demonstrate that our approach enhances phenotype prediction accuracy across species, capturing subtle genetic variations that contribute to observable traits.
   Submitted 10 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
8. arXiv:2502.03709 [pdf, other] (cs.HC)
   How to Make Your Multi-Image Posts Popular? An Approach to Enhanced Grid for Nine Images on Social Media
   Authors: Qi Xi, Shulin Li, Zhiqi Gao, Zibo Zhang, Shunye Tang, Jianchao Zhang, Liangxu Wang, Yiru Niu, Yan Zhang, Binhui Wang
   Abstract: The nine-grid layout is commonly used for multi-image posts, arranging nine images in a tic-tac-toe-style grid. This layout effectively presents content within limited space. However, given the many possible arrangements within the nine-image grid, the arrangement that yields the highest attractiveness remains unknown. Our study investigates how the arrangement of images within a nine-grid layout affects the overall popularity of the image set, aiming to identify arrangement schemes better aligned with user preferences. Based on survey results regarding user preferences in image arrangement, we identified two widely recognized ordering sequences, sequential order and center prioritization, and considered both image visual content and aesthetic quality as alignment metrics, resulting in four layout schemes. Finally, we recruited participants to annotate the different layout schemes for the same set of images. Our experience-centered evaluation indicates that the layout schemes based on aesthetic quality outperformed the others. This research yields empirical evidence supporting the optimization of the nine-grid layout for multi-image posts, furnishing content creators with valuable insights to enhance both attractiveness and user experience.
   Submitted 5 February, 2025; originally announced February 2025.
   Comments: To be published in Proceedings of the 2024 IEEE International Conference on Ubiquitous Intelligence and Computing
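   The two orderings the study compares, sketched with stand-in aesthetic scores (the real alignment metrics come from visual content and aesthetic-quality assessments):

      scores = [0.4, 0.9, 0.6, 0.2, 0.8, 0.5, 0.7, 0.3, 0.1]   # toy aesthetic scores
      ranked = sorted(range(9), key=lambda i: scores[i], reverse=True)

      sequential = ranked                     # best-to-worst in reading order
      center_first = ranked[1:5] + [ranked[0]] + ranked[5:]   # best image in cell 4 (center)

      for name, order in [("sequential", sequential), ("center-priority", center_first)]:
          print(name, [order[i:i + 3] for i in range(0, 9, 3)])  # the three grid rows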
9. arXiv:2501.18897 [pdf, other] (stat.ML, cs.LG)
   Trustworthy Evaluation of Generative AI Models
   Authors: Zijun Gao, Yan Sun
   Abstract: Generative AI (GenAI) models have recently achieved remarkable empirical performance in various applications; however, their evaluations still lack uncertainty quantification. In this paper, we propose a method to compare two generative models based on an unbiased estimator of their relative performance gap. Statistically, our estimator achieves a parametric convergence rate and asymptotic normality, which enables valid inference. Computationally, our method is efficient and can be accelerated by parallel computing and by pre-storing intermediate results. On simulated datasets with known ground truth, we show that our approach effectively controls type I error and achieves power comparable to commonly used metrics. Furthermore, we demonstrate the performance of our method in evaluating diffusion models on real image datasets with statistical confidence.
   Submitted 31 January, 2025; originally announced January 2025.
   Comments: 5 figures, 1 table, 15 pages
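   The flavor of the proposed inference, schematically: estimate the gap between two models from paired per-sample scores and attach a normal-approximation confidence interval. The scoring function and data are placeholders; the paper's estimator and its unbiasedness guarantees are more involved:

      import numpy as np

      rng = np.random.default_rng(3)
      score_a = rng.normal(0.62, 0.1, size=500)   # per-sample quality, model A (toy)
      score_b = rng.normal(0.60, 0.1, size=500)   # per-sample quality, model B (toy)

      diff = score_a - score_b
      gap = diff.mean()                               # estimated performance gap
      se = diff.std(ddof=1) / np.sqrt(len(diff))      # standard error
      print(f"gap = {gap:.4f}, 95% CI = [{gap - 1.96 * se:.4f}, {gap + 1.96 * se:.4f}]")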
10. arXiv:2501.18865 [pdf, other] (cs.CV, cs.AI, cs.LG)
    REG: Rectified Gradient Guidance for Conditional Diffusion Models
    Authors: Zhengqi Gao, Kaiwen Zha, Tianyuan Zhang, Zihui Xue, Duane S. Boning
    Abstract: Guidance techniques are simple yet effective for improving conditional generation in diffusion models. Despite their empirical success, the practical implementation of guidance diverges significantly from its theoretical motivation. In this paper, we reconcile this discrepancy by replacing the scaled marginal distribution target, which we prove theoretically invalid, with a valid scaled joint distribution objective. Additionally, we show that the established guidance implementations are approximations to the intractable optimal solution under a no-future-foresight constraint. Building on these theoretical insights, we propose rectified gradient guidance (REG), a versatile enhancement designed to boost the performance of existing guidance methods. Experiments on 1D and 2D tasks demonstrate that REG provides a better approximation to the optimal solution than prior guidance techniques, validating the proposed theoretical framework. Extensive experiments on class-conditional ImageNet and text-to-image generation tasks show that incorporating REG consistently improves FID and Inception/CLIP scores across various settings compared to its absence.
    Submitted 30 January, 2025; originally announced January 2025.
    Comments: 19 pages, 10 figures
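    For orientation, the standard classifier-free guidance rule that existing guidance methods, REG included, build on: extrapolate from the unconditional to the conditional noise prediction with weight w. REG's rectification itself is not reproduced here:

       import numpy as np

       def cfg_combine(eps_uncond: np.ndarray, eps_cond: np.ndarray, w: float) -> np.ndarray:
           """Guided prediction: eps_uncond + w * (eps_cond - eps_uncond)."""
           return eps_uncond + w * (eps_cond - eps_uncond)

       eps_u, eps_c = np.random.default_rng(4).normal(size=(2, 3, 8, 8))  # toy predictions
       print(cfg_combine(eps_u, eps_c, w=3.0).shape)                      # (3, 8, 8)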
11. arXiv:2501.18292 [pdf] (cs.IR, cs.CL, cs.DL; doi: 10.1016/j.joi.2024.101607)
    Citation Recommendation based on Argumentative Zoning of User Queries
    Authors: Shutian Ma, Chengzhi Zhang, Heng Zhang, Zheng Gao
    Abstract: Citation recommendation aims to locate the important papers for scholars to cite. When writing citing sentences, authors usually hold different citing intents, which are referred to as citation functions in citation analysis. Since argumentative zoning identifies the argumentative and rhetorical structure of scientific literature, we want to use this information to improve the citation recommendation task. In this paper, a multi-task learning model is built for citation recommendation and argumentative zoning classification. We also generated an annotated corpus of data from PubMed Central based on a new argumentative zoning schema. The experimental results show that, by considering the argumentative information in the citing sentence, the citation recommendation model achieves better performance.
    Submitted 30 January, 2025; originally announced January 2025.
    Journal ref: Journal of Informetrics, 2025
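    The multi-task objective in one line, with a generic mixing weight (the paper's exact losses and weighting are not stated in the abstract):

       def multitask_loss(loss_citation: float, loss_zoning: float, alpha: float = 0.5) -> float:
           """Shared-encoder objective: L = L_citation + alpha * L_zoning."""
           return loss_citation + alpha * loss_zoning

       print(multitask_loss(loss_citation=1.32, loss_zoning=0.87))  # 1.755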
12. arXiv:2501.15928 [pdf, other] (cs.NI, cs.AI, cs.LG)
    Generative AI for Lyapunov Optimization Theory in UAV-based Low-Altitude Economy Networking
    Authors: Zhang Liu, Dusit Niyato, Jiacheng Wang, Geng Sun, Lianfen Huang, Zhibin Gao, Xianbin Wang
    Abstract: Lyapunov optimization theory has recently emerged as a powerful mathematical framework for solving complex stochastic optimization problems by transforming long-term objectives into a sequence of real-time short-term decisions while ensuring system stability. This theory is particularly valuable in unmanned aerial vehicle (UAV)-based low-altitude economy (LAE) networking scenarios, where it can effectively address the inherent challenges of dynamic network conditions, multiple optimization objectives, and stability requirements. Recently, generative artificial intelligence (GenAI) has garnered significant attention for its unprecedented capability to generate diverse digital content. Extending beyond content generation, in this paper we propose a framework integrating generative diffusion models with reinforcement learning to address Lyapunov optimization problems in UAV-based LAE networking. We begin by introducing the fundamentals of Lyapunov optimization theory and analyzing the limitations of both conventional methods and traditional AI-enabled approaches. We then examine various GenAI models and comprehensively analyze their potential contributions to Lyapunov optimization. Subsequently, we develop a Lyapunov-guided generative diffusion model-based reinforcement learning framework and validate its effectiveness through a UAV-based LAE networking case study. Finally, we outline several directions for future research.
    Submitted 27 January, 2025; originally announced January 2025.
    Comments: 8 pages, 5 figures, magazine paper
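    The textbook Lyapunov recipe the article builds on, in miniature: maintain a virtual queue for the long-term resource constraint and, each slot, choose the action minimizing the drift-plus-penalty expression with trade-off weight V. The cost and usage values below are illustrative:

       import numpy as np

       rng = np.random.default_rng(5)
       usage = np.array([0.5, 1.0, 2.0])   # resource use of three candidate actions (toy)
       V, budget, Q = 10.0, 1.0, 0.0       # penalty weight, per-slot budget, virtual queue
       for t in range(5):
           cost = rng.uniform(0, 1, size=3)          # per-action penalty this slot
           a = int(np.argmin(V * cost + Q * usage))  # drift-plus-penalty decision
           Q = max(Q + usage[a] - budget, 0.0)       # queue tracks constraint violation
           print(f"slot {t}: action {a}, queue {Q:.2f}")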
arXiv:2501.15235 [pdf, ps, other] https://arxiv.org/abs/2501.15235
Subjects: cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition)
Large-Scale Riemannian Meta-Optimization via Subspace Adaptation
Authors: Peilin Yu, Yuwei Wu, Zhi Gao, Xiaomeng Fan, Yunde Jia
Abstract: Riemannian meta-optimization provides a promising approach to solving non-linear constrained optimization problems, which trains neural networks as optimizers to perform optimization on Riemannian manifolds. However, existing Riemannian meta-optimization methods take up huge memory footprints in large-scale optimization settings, as the learned optimizer can only adapt gradients of a fixed size and thus cannot be shared across different Riemannian parameters. In this paper, we propose an efficient Riemannian meta-optimization method that significantly reduces the memory burden for large-scale optimization via a subspace adaptation scheme. Our method trains neural networks to individually adapt the row and column subspaces of Riemannian gradients, instead of directly adapting the full gradient matrices as in existing Riemannian meta-optimization methods. In this case, our learned optimizer can be shared across Riemannian parameters with different sizes. Our method reduces the model memory consumption by six orders of magnitude when optimizing an orthogonal mainstream deep neural network (e.g., ResNet50). Experiments on multiple Riemannian tasks show that our method can not only reduce the memory consumption but also improve the performance of Riemannian meta-optimization.
Submitted 5 February, 2025; v1 submitted 25 January, 2025; originally announced January 2025.
Comments: Accepted by CVIU
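One way to read the subspace-adaptation idea is that the learned optimizer touches only per-row and per-column state, never an m-by-n buffer, so the same small network serves matrices of any shape. A hedged sketch of that reading (the norm statistics and the shared scalar network are our assumptions, not the authors' design):

```python
# Hedged sketch of the subspace-adaptation idea (our reading, not the authors'
# code): a small shared network adapts one vector per row subspace and one per
# column subspace, so per-parameter state grows as O(m + n), not O(m * n),
# and the optimizer is reusable across gradient shapes.
import torch
import torch.nn as nn

adapter = nn.Sequential(nn.Linear(1, 16), nn.Tanh(), nn.Linear(16, 1))  # shared across parameters

def adapt(grad):                                  # grad: (m, n) Riemannian gradient matrix
    row = grad.norm(dim=1, keepdim=True)          # (m, 1) row-subspace statistic
    col = grad.norm(dim=0, keepdim=True).T        # (n, 1) column-subspace statistic
    r = adapter(row)                              # learned per-row scale, (m, 1)
    c = adapter(col).T                            # learned per-column scale, (1, n)
    return r * grad * c                           # rescaling with no (m, n) optimizer state

g1, g2 = torch.randn(64, 32), torch.randn(128, 256)
print(adapt(g1).shape, adapt(g2).shape)           # one adapter handles both sizes
```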
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVIU</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span> [<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. 
However, it encounters… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12948v1-abstract-full" style="display: none;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'none'; document.getElementById('2501.12948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12222">arXiv:2501.12222</a> <span> [<a href="https://arxiv.org/pdf/2501.12222">pdf</a>, <a href="https://arxiv.org/ps/2501.12222">ps</a>, <a href="https://arxiv.org/format/2501.12222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Superconductivity">cond-mat.supr-con</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> Strong phonon-mediated high temperature superconductivity in Li$_2$AuH$_6$ under ambient pressure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ouyang%2C+Z">Zhenfeng Ouyang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bo-Wen Yao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao-Qi Han</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peng-Jie Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ze-Feng Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhong-Yi Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12222v1-abstract-short" style="display: inline;"> We used our developed AI search engine~(InvDesFlow) to perform extensive investigations regarding ambient stable superconducting 
hydrides. A cubic structure Li$_2$AuH$_6$ with Au-H octahedral motifs is identified to be a candidate. After performing thermodynamical analysis, we provide a feasible route to experimentally synthesize this material via the known LiAu and LiH compounds under ambient pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12222v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12222v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12222v1-abstract-full" style="display: none;"> We used our developed AI search engine~(InvDesFlow) to perform extensive investigations regarding ambient stable superconducting hydrides. A cubic structure Li$_2$AuH$_6$ with Au-H octahedral motifs is identified to be a candidate. After performing thermodynamical analysis, we provide a feasible route to experimentally synthesize this material via the known LiAu and LiH compounds under ambient pressure. The further first-principles calculations suggest that Li$_2$AuH$_6$ shows a high superconducting transition temperature ($T_c$) $\sim$ 140 K under ambient pressure. The H-1$s$ electrons strongly couple with phonon modes of vibrations of Au-H octahedrons as well as vibrations of Li atoms, where the latter is not taken seriously in other previously similar cases. Hence, different from previous claims of searching metallic covalent bonds to find high-$T_c$ superconductors, we emphasize here the importance of those phonon modes with strong electron-phonon coupling (EPC). And we suggest that one can intercalate atoms into binary or ternary hydrides to introduce more potential phonon modes with strong EPC, which is an effective approach to find high-$T_c$ superconductors within multicomponent compounds. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12222v1-abstract-full').style.display = 'none'; document.getElementById('2501.12222v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages; 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09957">arXiv:2501.09957</a> <span> [<a href="https://arxiv.org/pdf/2501.09957">pdf</a>, <a href="https://arxiv.org/format/2501.09957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FRAG: A Flexible Modular Framework for Retrieval-Augmented Generation based on Knowledge Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zengyi Gao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yukun Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hairu Wang</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+A">Ao Ke</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yuan Feng</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xike Xie</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S+K">S Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09957v2-abstract-short" style="display: inline;"> To mitigate the hallucination and knowledge deficiency in large language models (LLMs), Knowledge Graph (KG)-based Retrieval-Augmented Generation (RAG) has shown promising potential by utilizing KGs as external resource to enhance LLMs reasoning. However, existing KG-RAG approaches struggle with a trade-off between flexibility and retrieval quality. Modular methods prioritize flexibility by avoidi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09957v2-abstract-full').style.display = 'inline'; document.getElementById('2501.09957v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09957v2-abstract-full" style="display: none;"> To mitigate the hallucination and knowledge deficiency in large language models (LLMs), Knowledge Graph (KG)-based Retrieval-Augmented Generation (RAG) has shown promising potential by utilizing KGs as external resource to enhance LLMs reasoning. However, existing KG-RAG approaches struggle with a trade-off between flexibility and retrieval quality. Modular methods prioritize flexibility by avoiding the use of KG-fine-tuned models during retrieval, leading to fixed retrieval strategies and suboptimal retrieval quality. Conversely, coupled methods embed KG information within models to improve retrieval quality, but at the expense of flexibility. In this paper, we propose a novel flexible modular KG-RAG framework, termed FRAG, which synergizes the advantages of both approaches. FRAG estimates the hop range of reasoning paths based solely on the query and classify it as either simple or complex. To match the complexity of the query, tailored pipelines are applied to ensure efficient and accurate reasoning path retrieval, thus fostering the final reasoning process. 
arXiv:2501.08941 [pdf, other] https://arxiv.org/abs/2501.08941
Subjects: cs.MA (Multiagent Systems); cs.LG (Machine Learning); cs.RO (Robotics)
DOI: https://doi.org/10.2514/6.2025-2118
A Reinforcement Learning Approach to Quiet and Safe UAM Traffic Management
Authors: Surya Murthy, John-Paul Clarke, Ufuk Topcu, Zhenyu Gao
Abstract: Urban air mobility (UAM) is a transformative system that operates various small aerial vehicles in urban environments to reshape urban transportation. However, integrating UAM into existing urban environments presents a variety of complex challenges. Recent analyses of UAM's operational constraints highlight aircraft noise and system safety as key hurdles to UAM system implementation. Future UAM air traffic management schemes must ensure that the system is both quiet and safe. We propose a multi-agent reinforcement learning approach to manage UAM traffic, aiming at both vertical separation assurance and noise mitigation. Through extensive training, the reinforcement learning agent learns to balance the two primary objectives by employing altitude adjustments in a multi-layer UAM network. The results reveal the tradeoffs among noise impact, traffic congestion, and separation. Overall, our findings demonstrate the potential of reinforcement learning in mitigating UAM's noise impact while maintaining safe separation using altitude adjustments.
Submitted 15 January, 2025; originally announced January 2025.
Comments: Paper presented at SciTech 2025
Journal ref: AIAA SciTech 2025 Forum
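A reward that balances the two stated objectives might combine a noise term that grows as vehicles fly lower with a penalty for eroded vertical separation. A sketch with illustrative weights and functional forms (none of these constants come from the paper):

```python
# Hedged sketch of a two-objective UAM reward (weights and functional forms
# are illustrative guesses, not the paper's): flying lower increases ground
# noise; closing on another layer's traffic risks losing vertical separation.
def reward(altitude_m, nearest_vertical_gap_m,
           w_noise=1.0, w_sep=5.0, min_sep_m=150.0):
    noise_penalty = 1.0 / max(altitude_m, 1.0)                       # louder when lower
    sep_violation = max(min_sep_m - nearest_vertical_gap_m, 0.0) / min_sep_m
    return -(w_noise * noise_penalty + w_sep * sep_violation)

print(reward(altitude_m=300.0, nearest_vertical_gap_m=200.0))   # safe and fairly quiet
print(reward(altitude_m=300.0, nearest_vertical_gap_m=60.0))    # separation eroding
```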
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper presented at SciTech 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> AIAA SciTech 2025 Forum </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08670">arXiv:2501.08670</a> <span> [<a href="https://arxiv.org/pdf/2501.08670">pdf</a>, <a href="https://arxiv.org/format/2501.08670">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Augmenting Smart Contract Decompiler Output through Fine-grained Dependency Analysis and LLM-facilitated Semantic Recovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zeqin Liao</a>, <a href="/search/cs?searchtype=author&query=Nan%2C+Y">Yuhong Nan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zixu Gao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">Henglong Liang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+S">Sicheng Hao</a>, <a href="/search/cs?searchtype=author&query=Reng%2C+P">Peifan Reng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zibin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08670v1-abstract-short" style="display: inline;"> Decompiler is a specialized type of reverse engineering tool extensively employed in program analysis tasks, particularly in program comprehension and vulnerability detection. However, current Solidity smart contract decompilers face significant limitations in reconstructing the original source code. In particular, the bottleneck of SOTA decompilers lies in inaccurate method identification, incorr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08670v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08670v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08670v1-abstract-full" style="display: none;"> Decompiler is a specialized type of reverse engineering tool extensively employed in program analysis tasks, particularly in program comprehension and vulnerability detection. However, current Solidity smart contract decompilers face significant limitations in reconstructing the original source code. In particular, the bottleneck of SOTA decompilers lies in inaccurate method identification, incorrect variable type recovery, and missing contract attributes. These deficiencies hinder downstream tasks and understanding of the program logic. To address these challenges, we propose SmartHalo, a new framework that enhances decompiler output by combining static analysis (SA) and large language models (LLM). SmartHalo leverages the complementary strengths of SA's accuracy in control and data flow analysis and LLM's capability in semantic prediction. More specifically, \system{} constructs a new data structure - Dependency Graph (DG), to extract semantic dependencies via static analysis. Then, it takes DG to create prompts for LLM optimization. 
Finally, the correctness of LLM outputs is validated through symbolic execution and formal verification. Evaluation on a dataset consisting of 465 randomly selected smart contract methods shows that SmartHalo significantly improves the quality of the decompiled code, compared to SOTA decompilers (e.g., Gigahorse). Notably, integrating GPT-4o with SmartHalo further enhances its performance, achieving precision rates of 87.39% for method boundaries, 90.39% for variable types, and 80.65% for contract attributes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08670v1-abstract-full').style.display = 'none'; document.getElementById('2501.08670v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08585">arXiv:2501.08585</a> <span> [<a href="https://arxiv.org/pdf/2501.08585">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Review of Machine Learning Methods for Multimodal EEG Data in Clinical Application </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Siqi Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wangyang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiru Wang</a>, <a href="/search/cs?searchtype=author&query=Foglia%2C+S">Stevie Foglia</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hongzhao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bohan Zhang</a>, <a href="/search/cs?searchtype=author&query=Hamoodi%2C+A">Ameer Hamoodi</a>, <a href="/search/cs?searchtype=author&query=Nelson%2C+A">Aimee Nelson</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhen Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08585v1-abstract-short" style="display: inline;"> Machine learning (ML) and deep learning (DL) techniques have been widely applied to analyze electroencephalography (EEG) signals for disease diagnosis and brain-computer interfaces (BCI). The integration of multimodal data has been shown to enhance the accuracy of ML and DL models. 
Combining EEG with other modalities can improve clinical decision-making by addressing complex tasks in clinical popu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08585v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08585v1-abstract-full" style="display: none;"> Machine learning (ML) and deep learning (DL) techniques have been widely applied to analyze electroencephalography (EEG) signals for disease diagnosis and brain-computer interfaces (BCI). The integration of multimodal data has been shown to enhance the accuracy of ML and DL models. Combining EEG with other modalities can improve clinical decision-making by addressing complex tasks in clinical populations. This systematic literature review explores the use of multimodal EEG data in ML and DL models for clinical applications. A comprehensive search was conducted across PubMed, Web of Science, and Google Scholar, yielding 16 relevant studies after three rounds of filtering. These studies demonstrate the application of multimodal EEG data in addressing clinical challenges, including neuropsychiatric disorders, neurological conditions (e.g., seizure detection), neurodevelopmental disorders (e.g., autism spectrum disorder), and sleep stage classification. Data fusion occurred at three levels: signal, feature, and decision levels. The most commonly used ML models were support vector machines (SVM) and decision trees. Notably, 11 out of the 16 studies reported improvements in model accuracy with multimodal EEG data. This review highlights the potential of multimodal EEG-based ML models in enhancing clinical diagnostics and problem-solving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08585v1-abstract-full').style.display = 'none'; document.getElementById('2501.08585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
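Of the three fusion levels the review identifies, feature-level fusion is the easiest to picture: features from each modality are concatenated before a single classifier. A minimal sketch with synthetic data and an SVM, the review's most commonly reported model (the feature choices are invented for illustration):

```python
# Minimal feature-level fusion sketch matching the review's taxonomy
# (synthetic data; a real pipeline would first extract band-power or
# similar features): EEG and a second modality share one SVM.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
eeg_feats = rng.normal(size=(100, 32))     # e.g. band powers per channel
other_feats = rng.normal(size=(100, 8))    # e.g. ECG or eye-tracking features
y = rng.integers(0, 2, size=100)           # binary clinical label

fused = np.hstack([eeg_feats, other_feats])   # feature-level fusion
print(cross_val_score(SVC(kernel="rbf"), fused, y, cv=5).mean())
```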
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper includes 4 figures, 6 tables, and totals 18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08501">arXiv:2501.08501</a> <span> [<a href="https://arxiv.org/pdf/2501.08501">pdf</a>, <a href="https://arxiv.org/format/2501.08501">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scalable Bayesian Physics-Informed Kolmogorov-Arnold Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhiwei Gao</a>, <a href="/search/cs?searchtype=author&query=Karniadakis%2C+G+E">George Em Karniadakis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08501v2-abstract-short" style="display: inline;"> Uncertainty quantification (UQ) plays a pivotal role in scientific machine learning, especially when surrogate models are used to approximate complex systems. Although multilayer perceptions (MLPs) are commonly employed as surrogates, they often suffer from overfitting due to their large number of parameters. Kolmogorov-Arnold networks (KANs) offer an alternative solution with fewer parameters. Ho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08501v2-abstract-full').style.display = 'inline'; document.getElementById('2501.08501v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08501v2-abstract-full" style="display: none;"> Uncertainty quantification (UQ) plays a pivotal role in scientific machine learning, especially when surrogate models are used to approximate complex systems. Although multilayer perceptions (MLPs) are commonly employed as surrogates, they often suffer from overfitting due to their large number of parameters. Kolmogorov-Arnold networks (KANs) offer an alternative solution with fewer parameters. However, gradient-based inference methods, such as Hamiltonian Monte Carlo (HMC), may result in computational inefficiency when applied to KANs, especially for large-scale datasets, due to the high cost of back-propagation. To address these challenges, we propose a novel approach, combining the dropout Tikhonov ensemble Kalman inversion (DTEKI) with Chebyshev KANs. This gradient-free method effectively mitigates overfitting and enhances numerical stability. Additionally, we incorporate the active subspace method to reduce the parameter-space dimensionality, allowing us to improve the accuracy of predictions and obtain more reliable uncertainty estimates. Extensive experiments demonstrate the efficacy of our approach in various test cases, including scenarios with large datasets and high noise levels. Our results show that the new method achieves comparable or better accuracy, much higher efficiency as well as stability compared to HMC, in addition to scalability. 
arXiv:2501.06282 [pdf, other] https://arxiv.org/abs/2501.06282
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction); cs.SD (Sound); eess.AS (Audio and Speech Processing)
MinMo: A Multimodal Large Language Model for Seamless Voice Interaction
Authors: Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, Yabin Li, Xiang Lv, Jiaqing Liu, Haoneng Luo, Bin Ma, Chongjia Ni, Xian Shi, Jialong Tang, Hui Wang, Hao Wang, Wen Wang, Yuxuan Wang, Yunlan Xu, Fan Yu, Zhijie Yan, et al. (11 additional authors not shown)
Abstract: Recent advancements in large language models (LLMs) and multimodal speech-text models have laid the groundwork for seamless voice interactions, enabling real-time, natural, and human-like conversations. Previous models for voice interactions are categorized as native and aligned. Native models integrate speech and text processing in one framework but struggle with issues like differing sequence lengths and insufficient pre-training. Aligned models maintain text LLM capabilities but are often limited by small datasets and a narrow focus on speech tasks. In this work, we introduce MinMo, a Multimodal Large Language Model with approximately 8B parameters for seamless voice interaction. We address the main limitations of prior aligned multimodal models. We train MinMo through multiple stages of speech-to-text alignment, text-to-speech alignment, speech-to-speech alignment, and duplex interaction alignment, on 1.4 million hours of diverse speech data and a broad range of speech tasks. After the multi-stage training, MinMo achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text LLMs, and also facilitates full-duplex conversation, that is, simultaneous two-way communication between the user and the system. Moreover, we propose a novel and simple voice decoder that outperforms prior models in voice generation. The enhanced instruction-following capabilities of MinMo support controlling speech generation based on user instructions, with various nuances including emotions, dialects, and speaking rates, and mimicking specific voices. For MinMo, the speech-to-text latency is approximately 100ms; full-duplex latency is approximately 600ms in theory and 800ms in practice. The MinMo project web page is https://funaudiollm.github.io/minmo, and the code and models will be released soon.
Submitted 10 January, 2025; originally announced January 2025.
Comments: Work in progress. Authors are listed in alphabetical order by family name

arXiv:2501.04996 [pdf] https://arxiv.org/abs/2501.04996
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
A CT Image Classification Network Framework for Lung Tumors Based on Pre-trained MobileNetV2 Model and Transfer learning, And Its Application and Market Analysis in the Medical field
Authors: Ziyang Gao, Yong Tian, Shih-Chi Lin, Junghua Lin
Abstract: In the medical field, accurate diagnosis of lung cancer is crucial for treatment. Traditional manual analysis methods have significant limitations in terms of accuracy and efficiency. To address this issue, this paper proposes a deep learning network framework based on the pre-trained MobileNetV2 model, initialized with weights from the ImageNet-1K dataset (version 2). The last layer of the model (the fully connected layer) is replaced with a new fully connected layer, and a softmax activation function is added to efficiently classify three types of lung cancer CT scan images. Experimental results show that the model achieves an accuracy of 99.6% on the test set, with significant improvements in feature extraction compared to traditional models. With the rapid development of artificial intelligence technologies, deep learning applications in medical image processing are bringing revolutionary changes to the healthcare industry. AI-based lung cancer detection systems can significantly improve diagnostic efficiency, reduce the workload of doctors, and occupy an important position in the global healthcare market. The potential of AI to improve diagnostic accuracy, reduce medical costs, and promote precision medicine will have a profound impact on the future development of the healthcare industry.
Submitted 9 January, 2025; originally announced January 2025.
arXiv:2501.04643 [pdf, other] https://arxiv.org/abs/2501.04643
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Discrete Wavelet Transform-Based Capsule Network for Hyperspectral Image Classification
Authors: Zhiqiang Gao, Jiaqi Wang, Hangchi Shen, Zhihao Dou, Xiangbo Zhang, Kaizhu Huang
Abstract: Hyperspectral image (HSI) classification is a crucial technique for remote sensing to build large-scale earth monitoring systems. HSI contains much more information than traditional visual images for identifying the categories of land covers. One recent feasible solution for HSI is to leverage CapsNets for capturing spectral-spatial information. However, these methods entail high computational costs due to the fully connected architecture between stacked capsule layers. To solve this problem, a DWT-CapsNet is proposed to identify partial but important connections in CapsNet for an effective and efficient HSI classification. Specifically, we integrate a tailored attention mechanism into a Discrete Wavelet Transform (DWT)-based downsampling layer, alleviating the information loss problem of the conventional downsampling operation in feature extractors. Moreover, we propose a novel multi-scale routing algorithm that prunes a large proportion of connections in CapsNet. A capsule pyramid fusion mechanism is designed to aggregate the spectral-spatial relationships in multiple levels of granularity, and a self-attention mechanism is then conducted in a partially and locally connected architecture to emphasize meaningful relationships. As shown in the experimental results, our method achieves state-of-the-art accuracy while keeping lower computational demand in terms of running time, flops, and the number of parameters, rendering it an appealing choice for practical implementation in HSI classification.
Submitted 8 January, 2025; originally announced January 2025.
Comments: 28 pages; 9 figures
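A DWT-based downsampling layer halves spatial resolution while keeping the detail sub-bands that plain pooling throws away. A generic sketch with a Haar wavelet via PyWavelets (the paper's tailored attention over the sub-bands is omitted):

```python
# Generic DWT downsampling sketch (Haar wavelet via PyWavelets, not the
# paper's full layer): the low-frequency band halves spatial size while the
# detail bands retain information that plain pooling would discard.
import numpy as np
import pywt

feature_map = np.random.rand(64, 64)                 # one spatial channel
LL, (LH, HL, HH) = pywt.dwt2(feature_map, "haar")    # 4 sub-bands, each 32x32
downsampled = np.stack([LL, LH, HL, HH])             # keep details as extra channels
print(downsampled.shape)                             # (4, 32, 32)
```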
arXiv:2501.02738 [pdf, other] https://arxiv.org/abs/2501.02738
Subjects: cs.IT (Information Theory)
SCSC: A Novel Standards-Compatible Semantic Communication Framework for Image Transmission
Authors: Xue Han, Yongpeng Wu, Zhen Gao, Biqian Feng, Yuxuan Shi, Deniz Gündüz, Wenjun Zhang
Abstract: Joint source-channel coding (JSCC) is a promising paradigm for next-generation communication systems, particularly in challenging transmission environments. In this paper, we propose a novel standard-compatible JSCC framework for the transmission of images over multiple-input multiple-output (MIMO) channels. Different from the existing end-to-end AI-based DeepJSCC schemes, our framework consists of learnable modules that enable communication using conventional separate source and channel codes (SSCC), which makes it amenable to easy deployment on legacy systems. Specifically, the learnable modules involve a preprocessing-empowered network (PPEN) for preserving essential semantic information, and a precoder & combiner-enhanced network (PCEN) for efficient transmission over a resource-constrained MIMO channel. We treat existing compression and channel coding modules as non-trainable blocks. Since the parameters of these modules are non-differentiable, we employ a proxy network that mimics their operations when training the learnable modules. Numerical results demonstrate that our scheme can save more than 29% of the channel bandwidth and requires lower complexity compared to the constrained baselines. We also show its generalization capability to unseen datasets and tasks through extensive experiments.
Submitted 5 January, 2025; originally announced January 2025.
Comments: Accepted by IEEE Transactions on Communications
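Training learnable modules around a frozen, non-differentiable codec is the framework's key trick: a differentiable proxy supplies the gradient path. A hedged, minimal rendering of the idea (the quantizing stand-in "codec", module sizes, and straight-through wiring are our illustrative assumptions, not the SCSC implementation):

```python
# Hedged sketch of the proxy-network trick (our minimal rendering, not the
# SCSC code): the true codec is non-differentiable, so gradients for the
# learnable preprocessing module flow through a neural proxy instead.
import torch
import torch.nn as nn

codec = lambda z: torch.round(z * 8) / 8           # stand-in for compression + channel code
proxy = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
ppen = nn.Linear(16, 16)                           # learnable preprocessing module

x = torch.randn(4, 16)
z = ppen(x)
# forward value comes from the real codec; the gradient path goes through the proxy
out = codec(z).detach() + proxy(z) - proxy(z).detach()
loss = nn.functional.mse_loss(out, x)              # end-to-end reconstruction loss
loss.backward()                                    # trains ppen (the proxy is fit to the codec separately)
print(ppen.weight.grad.norm() > 0)
```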
arXiv:2501.00013 [pdf, other] q-bio.QM cs.AI cs.LG
Relation-Aware Equivariant Graph Networks for Epitope-Unknown Antibody Design and Specificity Optimization
Authors: Lirong Wu, Haitao Lin, Yufei Huang, Zhangyang Gao, Cheng Tan, Yunfan Liu, Tailin Wu, Stan Z. Li
Abstract: Antibodies are Y-shaped proteins that protect the host by binding to specific antigens, and their binding is mainly determined by the Complementarity-Determining Regions (CDRs) in the antibody. Despite the great progress made in CDR design, existing computational methods still encounter several challenges: 1) poor capability of modeling complex CDRs with long sequences due to insufficient contextual information; 2) reliance on pre-given antigenic epitopes and a static view of their interaction with the target antibody; and 3) neglect of specificity during antibody optimization, which leads to non-specific antibodies. In this paper, we take into account a variety of node features, edge features, and edge relations to include more contextual and geometric information. We propose a novel Relation-Aware Antibody Design (RAAD) framework, which dynamically models antigen-antibody interactions for co-designing the sequences and structures of antigen-specific CDRs. Furthermore, we propose a new evaluation metric to better measure antibody specificity and develop a contrasting specificity-enhancing constraint to optimize the specificity of antibodies. Extensive experiments have demonstrated the superior capability of RAAD in terms of antibody modeling, generation, and optimization across different CDR types, sequence lengths, pre-training strategies, and input contexts.
Submitted 13 December, 2024; originally announced January 2025.
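As a rough illustration of the "relation-aware" idea, a separate message function per edge relation over the antigen-antibody graph, here is a minimal sketch. The relation set, feature sizes, and GRU update rule are assumptions for illustration, not the RAAD architecture.

```python
import torch
import torch.nn as nn

class RelationAwareLayer(nn.Module):
    """One message-passing step with a separate transform per edge relation."""
    def __init__(self, dim: int, num_relations: int = 3):
        super().__init__()
        # e.g. relations: intra-antibody, intra-antigen, antigen-antibody contact
        self.msg = nn.ModuleList(nn.Linear(2 * dim, dim) for _ in range(num_relations))
        self.update = nn.GRUCell(dim, dim)

    def forward(self, h, edges, rel):
        # h: [N, dim] node features; edges: [E, 2] (src, dst); rel: [E] relation ids
        agg = torch.zeros_like(h)
        for r, f in enumerate(self.msg):
            mask = rel == r
            if mask.any():
                src, dst = edges[mask, 0], edges[mask, 1]
                m = f(torch.cat([h[src], h[dst]], dim=-1))
                agg = agg.index_add(0, dst, m)  # sum relation-specific messages
        return self.update(agg, h)

# toy usage on a random 10-node graph
h = torch.randn(10, 64)
edges = torch.randint(0, 10, (30, 2))
rel = torch.randint(0, 3, (30,))
h = RelationAwareLayer(64)(h, edges, rel)
```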
arXiv:2412.19437 [pdf, other] cs.CL cs.AI
DeepSeek-V3 Technical Report
Authors: DeepSeek-AI, Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, Damai Dai, Daya Guo, Dejian Yang, Deli Chen, Dongjie Ji, Erhang Li, Fangyun Lin, Fucong Dai, Fuli Luo, Guangbo Hao, Guanting Chen, Guowei Li, H. Zhang, Han Bao, et al. (175 additional authors not shown)
Abstract: We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters, of which 37B are activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable: throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.
Submitted 18 February, 2025; v1 submitted 26 December, 2024; originally announced December 2024.
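The auxiliary-loss-free load balancing mentioned above steers routing with a per-expert bias that is nudged after each batch instead of with an extra loss term: the bias influences which experts are selected, while the gating weights still come from the raw affinity scores. A minimal sketch of that update follows; the bias step size gamma and all shapes are illustrative, not the report's values.

```python
import torch

num_experts, top_k, gamma = 8, 2, 0.001
bias = torch.zeros(num_experts)  # routing bias, updated outside backprop

def route(scores: torch.Tensor):
    # scores: [tokens, num_experts] affinity scores (e.g. sigmoid of logits)
    _, idx = (scores + bias).topk(top_k, dim=-1)   # bias shifts the selection...
    gate = torch.gather(scores, -1, idx)           # ...but gates use raw scores
    gate = gate / gate.sum(-1, keepdim=True)
    return idx, gate

def update_bias(idx: torch.Tensor):
    # push bias down for overloaded experts, up for underloaded ones
    global bias
    load = torch.bincount(idx.flatten(), minlength=num_experts).float()
    bias = bias - gamma * torch.sign(load - load.mean())

scores = torch.rand(32, num_experts)
idx, gate = route(scores)
update_bias(idx)
```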
arXiv:2412.16225 [pdf, other] eess.SY cs.AI cs.LG cs.MA
Bayesian Critique-Tune-Based Reinforcement Learning with Adaptive Pressure for Multi-Intersection Traffic Signal Control
Authors: Wenchang Duan, Zhenguo Gao, Jiwan He, Jinguo Xian
Abstract: Adaptive Traffic Signal Control (ATSC) systems are a critical component of intelligent transportation, with the capability to significantly alleviate urban traffic congestion. Although reinforcement learning (RL)-based methods have demonstrated promising performance in achieving ATSC, existing methods are still prone to making unreasonable policies. Therefore, this paper proposes a novel Bayesian Critique-Tune-Based Reinforcement Learning with Adaptive Pressure for multi-intersection signal control (BCT-APLight). In BCT-APLight, the Critique-Tune (CT) framework, a two-layer Bayesian structure, is designed to temper excessive trust in RL policies. Specifically, the Bayesian inference-based Critique Layer provides effective evaluations of the credibility of policies; the Bayesian decision-based Tune Layer fine-tunes policies by minimizing the posterior risks when the evaluations are negative. Meanwhile, an attention-based Adaptive Pressure (AP) mechanism is designed to effectively weight the vehicle queues in each lane, thereby enhancing the rationality of traffic movement representation within the network. Equipped with the CT framework and AP mechanism, BCT-APLight effectively enhances the reasonableness of RL policies. Extensive experiments conducted with a simulator across a range of intersection layouts demonstrate that BCT-APLight is superior to other state-of-the-art (SOTA) methods on seven real-world datasets. Specifically, BCT-APLight decreases average queue length by 9.60% and average waiting time by 15.28%.
Submitted 25 December, 2024; v1 submitted 18 December, 2024; originally announced December 2024.
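The Adaptive Pressure idea, replacing a plain sum of lane queue lengths with attention-derived weights, can be sketched in a few lines. The linear scoring head and the lane feature vector are placeholders, not the paper's exact mechanism.

```python
import torch
import torch.nn as nn

class AdaptivePressure(nn.Module):
    """Weights each lane's queue by a learned attention score before aggregating."""
    def __init__(self, feat_dim: int):
        super().__init__()
        self.score = nn.Linear(feat_dim, 1)  # hypothetical lane-scoring head

    def forward(self, lane_feats, queues):
        # lane_feats: [lanes, feat_dim] per-lane features; queues: [lanes] counts
        w = torch.softmax(self.score(lane_feats).squeeze(-1), dim=0)
        return (w * queues).sum()            # scalar pressure for the intersection

ap = AdaptivePressure(feat_dim=4)
pressure = ap(torch.randn(12, 4), torch.randint(0, 20, (12,)).float())
```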
arXiv:2412.15924 [pdf, other] cs.CV cs.AI cs.CR
Watertox: The Art of Simplicity in Universal Attacks A Cross-Model Framework for Robust Adversarial Generation
Authors: Zhenghao Gao, Shengjie Xu, Meixi Chen, Fangyao Zhao
Abstract: Contemporary adversarial attack methods face significant limitations in cross-model transferability and practical applicability. We present Watertox, an elegant adversarial attack framework achieving remarkable effectiveness through architectural diversity and precision-controlled perturbations. Our two-stage Fast Gradient Sign Method combines uniform baseline perturbations ($ε_1 = 0.1$) with targeted enhancements ($ε_2 = 0.4$). The framework leverages an ensemble of complementary architectures, from VGG to ConvNeXt, synthesizing diverse perspectives through an innovative voting mechanism. Against state-of-the-art architectures, Watertox reduces model accuracy from 70.6% to 16.0%, with zero-shot attacks achieving up to 98.8% accuracy reduction against unseen architectures. These results establish Watertox as a significant advancement in adversarial methodologies, with promising applications in visual security systems and CAPTCHA generation.
Submitted 20 December, 2024; originally announced December 2024.
Comments: 18 pages, 4 figures, 3 tables. Advances a novel method for generating cross-model transferable adversarial perturbations through a two-stage FGSM process and architectural ensemble voting mechanism
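A two-stage perturbation of this kind, a uniform FGSM pass at $ε_1 = 0.1$ followed by a stronger pass at $ε_2 = 0.4$ restricted to the most sensitive pixels, might look roughly like the sketch below. The region-selection rule (top-10% gradient magnitude) and the sign-averaging in place of the paper's voting mechanism are assumptions.

```python
import torch
import torch.nn.functional as F

def fgsm_grad(model, x, y):
    """Sign of the input gradient of the cross-entropy loss."""
    x = x.clone().requires_grad_(True)
    F.cross_entropy(model(x), y).backward()
    return x.grad.sign()

def two_stage_fgsm(models, x, y, eps1=0.1, eps2=0.4, frac=0.1):
    # Stage 1: uniform baseline perturbation, sign averaged over the ensemble
    g = torch.stack([fgsm_grad(m, x, y) for m in models]).mean(0).sign()
    x_adv = x + eps1 * g
    # Stage 2: stronger perturbation only on the most gradient-sensitive pixels
    sal = torch.stack([fgsm_grad(m, x_adv, y).abs() for m in models]).mean(0)
    thresh = sal.flatten(1).quantile(1 - frac, dim=1).view(-1, 1, 1, 1)
    mask = (sal >= thresh).float()
    return (x_adv + eps2 * mask * g).clamp(0, 1)

# toy usage with two throwaway linear "architectures"
models = [torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
          for _ in range(2)]
x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
x_adv = two_stage_fgsm(models, x, y)
```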
arXiv:2412.15606 [pdf, other] cs.AI cs.CV
Multi-modal Agent Tuning: Building a VLM-Driven Agent for Efficient Tool Usage
Authors: Zhi Gao, Bofei Zhang, Pengxiang Li, Xiaojian Ma, Tao Yuan, Yue Fan, Yuwei Wu, Yunde Jia, Song-Chun Zhu, Qing Li
Abstract: The advancement of large language models (LLMs) prompts the development of multi-modal agents, which are used as a controller to call external tools, providing a feasible way to solve practical tasks. In this paper, we propose a multi-modal agent tuning method that automatically generates multi-modal tool-usage data and tunes a vision-language model (VLM) as the controller for powerful tool-usage reasoning. To ensure data quality, we prompt the GPT-4o mini model to generate queries, files, and trajectories, followed by query-file and trajectory verifiers. Based on this data synthesis pipeline, we collect the MM-Traj dataset, which contains 20K tasks with trajectories of tool usage. We then develop the T3-Agent via Trajectory Tuning on VLMs for Tool usage using MM-Traj. Evaluations on the GTA and GAIA benchmarks show that the T3-Agent consistently achieves improvements on two popular VLMs, MiniCPM-V-8.5B and Qwen2-VL-7B, outperforming untrained VLMs by 20% and demonstrating that the proposed data synthesis pipeline yields high-quality data for tool-usage capabilities.
Submitted 3 February, 2025; v1 submitted 20 December, 2024; originally announced December 2024.
Comments: ICLR 2025, https://mat-agent.github.io/
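The generate-then-verify pipeline described above (model-generated queries and trajectories, filtered by verifiers before entering the dataset) reduces to a simple loop. Everything in this sketch is a hypothetical stand-in for the LLM-backed prompts and checks; only the loop structure is the point.

```python
# Minimal sketch of a generate-then-verify data synthesis loop (Python 3.9+).
# Every callable here is a hypothetical stand-in for an LLM-backed component.

def gen_query():             # would prompt a model for a task plus matching files
    return {"query": "crop the largest face in photo.png", "files": ["photo.png"]}

def gen_trajectory(sample):  # would prompt the model to solve it with tool calls
    return sample | {"steps": [("detect_faces", "photo.png"), ("crop", "box_0")]}

def query_file_ok(sample):   # verifier 1: query and files are mutually consistent
    return bool(sample["query"]) and bool(sample["files"])

def trajectory_ok(sample):   # verifier 2: trajectory is non-empty and executable
    return len(sample["steps"]) > 0

dataset = []
while len(dataset) < 3:      # keep only samples that pass both verifiers
    s = gen_query()
    if not query_file_ok(s):
        continue
    s = gen_trajectory(s)
    if trajectory_ok(s):
        dataset.append(s)
print(len(dataset), "verified samples")
```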
arXiv:2412.13877 [pdf, other] cs.RO cs.AI
RoboMIND: Benchmark on Multi-embodiment Intelligence Normative Data for Robot Manipulation
Authors: Kun Wu, Chengkai Hou, Jiaming Liu, Zhengping Che, Xiaozhu Ju, Zhuqin Yang, Meng Li, Yinuo Zhao, Zhiyuan Xu, Guang Yang, Shichao Fan, Xinhua Wang, Fei Liao, Zhen Zhao, Guangyu Li, Zhao Jin, Lecheng Wang, Jilei Mao, Ning Liu, Pei Ren, Qiang Zhang, Yaoxu Lyu, Mengzhen Liu, Jingyang He, Yulin Luo, et al. (12 additional authors not shown)
Abstract: In this paper, we introduce RoboMIND (Multi-embodiment Intelligence Normative Data for Robot Manipulation), a dataset containing 107k demonstration trajectories across 479 diverse tasks involving 96 object classes. RoboMIND is collected through human teleoperation and encompasses comprehensive robot-related information, including multi-view observations, proprioceptive robot state information, and linguistic task descriptions. To ensure data consistency and reliability for imitation learning, RoboMIND is built on a unified data collection platform and a standardized protocol, covering four distinct robotic embodiments: the Franka Emika Panda, the UR5e, the AgileX dual-arm robot, and a humanoid robot with dual dexterous hands. Our dataset also includes 5k real-world failure demonstrations, each accompanied by detailed causes, enabling failure reflection and correction during policy learning. Additionally, we created a digital twin environment in the Isaac Sim simulator, replicating the real-world tasks and assets, which facilitates the low-cost collection of additional training data and enables efficient evaluation. To demonstrate the quality and diversity of our dataset, we conducted extensive experiments using various imitation learning methods for single-task settings and state-of-the-art Vision-Language-Action (VLA) models for multi-task scenarios. By leveraging RoboMIND, the VLA models achieved high manipulation success rates and demonstrated strong generalization capabilities. To the best of our knowledge, RoboMIND is the largest multi-embodiment teleoperation dataset collected on a unified platform, providing large-scale and high-quality robotic training data. Our project is at https://x-humanoid-robomind.github.io/.
Submitted 14 February, 2025; v1 submitted 18 December, 2024; originally announced December 2024.
arXiv:2412.12587 [pdf, other] cs.IT cs.AI cs.NI
Distributed satellite information networks: Architecture, enabling technologies, and trends
Authors: Qinyu Zhang, Liang Xu, Jianhao Huang, Tao Yang, Jian Jiao, Ye Wang, Yao Shi, Chiya Zhang, Xingjian Zhang, Ke Zhang, Yupeng Gong, Na Deng, Nan Zhao, Zhen Gao, Shujun Han, Xiaodong Xu, Li You, Dongming Wang, Shan Jiang, Dixian Zhao, Nan Zhang, Liujun Hu, Xiongwen He, Yonghui Li, Xiqi Gao, et al. (1 additional author not shown)
Abstract: Driven by the vision of ubiquitous connectivity and wireless intelligence, the evolution of ultra-dense constellation-based satellite-integrated Internet is underway and now taking preliminary shape. Nevertheless, entrenched institutional silos and limited, nonrenewable heterogeneous network resources leave current satellite systems struggling to accommodate the escalating demands of next-generation intelligent applications. In this context, distributed satellite information networks (DSIN), exemplified by cohesive clustered satellite systems, have emerged as an innovative architecture, bridging information gaps across diverse satellite systems, such as communication, navigation, and remote sensing, and establishing a unified, open information network paradigm to support resilient space information services. This survey first provides a thorough discussion of the innovative network architectures of DSIN, encompassing distributed regenerative satellite network architecture, distributed satellite computing network architecture, and reconfigurable satellite formation flying, to enable flexible and scalable communication, computing, and control. The DSIN faces challenges from network heterogeneity, unpredictable channel dynamics, sparse resources, and decentralized collaboration frameworks. To address these issues, a series of enabling technologies is identified, including channel modeling and estimation, cloud-native distributed MIMO cooperation, grant-free massive access, network routing, and the proper combination of all these diversity techniques. Furthermore, to heighten overall resource efficiency, cross-layer optimization techniques are further developed to meet upper-layer deterministic, adaptive, and secure information service requirements. In addition, emerging research directions and new opportunities are highlighted on the way to achieving the DSIN vision.
Submitted 17 December, 2024; originally announced December 2024.
arXiv:2412.11907 [pdf, other] cs.SD eess.AS
AudioCIL: A Python Toolbox for Audio Class-Incremental Learning with Multiple Scenes
Authors: Qisheng Xu, Yulin Sun, Yi Su, Qian Zhu, Xiaoyi Tan, Hongyu Wen, Zijian Gao, Kele Xu, Yong Dou, Dawei Feng
Abstract: Deep learning, with its robust automatic feature extraction capabilities, has demonstrated significant success in audio signal processing. Typically, these methods rely on static, pre-collected large-scale datasets for training, performing well on a fixed number of classes. However, the real world is characterized by constant change, with new audio classes emerging from streaming sources or being only temporarily available due to privacy constraints. This dynamic nature of audio environments necessitates models that can incrementally learn new knowledge for new classes without discarding existing information. Introducing incremental learning to the field of audio signal processing, i.e., Audio Class-Incremental Learning (AuCIL), is a meaningful endeavor. We propose a toolbox named AudioCIL to align audio signal processing algorithms with real-world scenarios and strengthen research in audio class-incremental learning. Code is available at https://github.com/colaudiolab/AudioCIL.
Submitted 18 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
arXiv:2412.10117 [pdf, other] cs.SD cs.AI cs.LG eess.AS
CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models
Authors: Zhihao Du, Yuxuan Wang, Qian Chen, Xian Shi, Xiang Lv, Tianyu Zhao, Zhifu Gao, Yexin Yang, Changfeng Gao, Hui Wang, Fan Yu, Huadai Liu, Zhengyan Sheng, Yue Gu, Chong Deng, Wen Wang, Shiliang Zhang, Zhijie Yan, Jingren Zhou
Abstract: In our previous work, we introduced CosyVoice, a multilingual speech synthesis model based on supervised discrete speech tokens. By employing progressive semantic decoding with two popular generative models, language models (LMs) and Flow Matching, CosyVoice demonstrated high prosody naturalness, content consistency, and speaker similarity in speech in-context learning. Recently, significant progress has been made in multi-modal large language models (LLMs), where the response latency and real-time factor of speech synthesis play a crucial role in the interactive experience. Therefore, in this report, we present an improved streaming speech synthesis model, CosyVoice 2, which incorporates comprehensive and systematic optimizations. Specifically, we introduce finite-scalar quantization to improve the codebook utilization of speech tokens. For the text-speech LM, we streamline the model architecture to allow direct use of a pre-trained LLM as the backbone. In addition, we develop a chunk-aware causal flow matching model to support various synthesis scenarios, enabling both streaming and non-streaming synthesis within a single model. By training on a large-scale multilingual dataset, CosyVoice 2 achieves human-parity naturalness, minimal response latency, and virtually lossless synthesis quality in streaming mode. We invite readers to listen to the demos at https://funaudiollm.github.io/cosyvoice2.
Submitted 25 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
Comments: Tech report, work in progress
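Finite-scalar quantization, adopted above to improve codebook utilization, replaces nearest-neighbor codebook lookup with per-dimension rounding to a small fixed set of levels, kept trainable via a straight-through estimator. A minimal sketch (the level counts are illustrative, not the report's configuration):

```python
import torch

def fsq(z: torch.Tensor, levels=(8, 5, 5, 5)) -> torch.Tensor:
    # z: [..., len(levels)]; each dimension is squashed, then rounded to L values
    L = torch.tensor(levels, dtype=z.dtype)
    half = (L - 1) / 2
    bounded = torch.tanh(z) * half          # map each dim into [-half, half]
    quantized = bounded.round()
    # straight-through estimator: forward uses round, backward sees identity
    return bounded + (quantized - bounded).detach()

codes = fsq(torch.randn(2, 10, 4))          # implicit codebook of 8*5*5*5 entries
```

Because every dimension is quantized independently, all combinations of levels are reachable, which is what keeps the implicit codebook fully utilized.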
arXiv:2412.09951 [pdf, other] cs.CV
WiseAD: Knowledge Augmented End-to-End Autonomous Driving with Vision-Language Model
Authors: Songyan Zhang, Wenhui Huang, Zihui Gao, Hao Chen, Chen Lv
Abstract: The emergence of general human knowledge and impressive logical reasoning capacity in rapidly progressing vision-language models (VLMs) has driven increasing interest in applying VLMs to high-level autonomous driving tasks, such as scene understanding and decision-making. However, an in-depth study of the relationship between knowledge proficiency, especially essential driving expertise, and closed-loop autonomous driving performance requires further exploration. In this paper, we investigate the effects of the depth and breadth of fundamental driving knowledge on closed-loop trajectory planning and introduce WiseAD, a specialized VLM tailored for end-to-end autonomous driving that is capable of driving reasoning, action justification, object recognition, risk analysis, driving suggestions, and trajectory planning across diverse scenarios. We employ joint training on driving knowledge and planning datasets, enabling the model to perform knowledge-aligned trajectory planning. Extensive experiments indicate that as the diversity of driving knowledge extends, critical accidents are notably reduced, contributing 11.9% and 12.4% improvements in the driving score and route completion on the CARLA closed-loop evaluations, achieving state-of-the-art performance. Moreover, WiseAD also demonstrates remarkable performance in knowledge evaluations on both in-domain and out-of-domain datasets.
Submitted 17 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.

arXiv:2412.08221 [pdf, other] cs.CV cs.AI cs.LG
Generate Any Scene: Evaluating and Improving Text-to-Vision Generation with Scene Graph Programming
Authors: Ziqi Gao, Weikai Huang, Jieyu Zhang, Aniruddha Kembhavi, Ranjay Krishna
Abstract: DALL-E and Sora have gained attention by producing implausible images, such as "astronauts riding a horse in space." Despite the proliferation of text-to-vision models that have inundated the internet with synthetic visuals, from images to 3D assets, current benchmarks predominantly evaluate these models on real-world scenes paired with captions. We introduce Generate Any Scene, a framework that systematically enumerates scene graphs representing a vast array of visual scenes, spanning realistic to imaginative compositions. Generate Any Scene leverages "scene graph programming," a method for dynamically constructing scene graphs of varying complexity from a structured taxonomy of visual elements. This taxonomy includes numerous objects, attributes, and relations, enabling the synthesis of an almost infinite variety of scene graphs. Using these structured representations, Generate Any Scene translates each scene graph into a caption, enabling scalable evaluation of text-to-vision models through standard metrics. We conduct extensive evaluations across multiple text-to-image, text-to-video, and text-to-3D models, presenting key findings on model performance. We find that DiT-backbone text-to-image models align more closely with input captions than UNet-backbone models. Text-to-video models struggle with balancing dynamics and consistency, while both text-to-video and text-to-3D models show notable gaps in human preference alignment. We demonstrate the effectiveness of Generate Any Scene through three practical applications leveraging captions generated by the framework: 1) a self-improving framework where models iteratively enhance their performance using generated data, 2) a distillation process to transfer specific strengths from proprietary models to open-source counterparts, and 3) improvements in content moderation by identifying and generating challenging synthetic data.
Submitted 16 December, 2024; v1 submitted 11 December, 2024; originally announced December 2024.
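The core mechanism of the last entry, programmatically sampling a scene graph from a taxonomy of objects, attributes, and relations and then flattening it into a caption, can be sketched in a few lines. The tiny vocabulary and the caption template below are placeholders for the framework's much larger structured taxonomy.

```python
import random

# Toy taxonomy; the real framework draws from a large structured vocabulary.
OBJECTS = ["astronaut", "horse", "robot", "castle"]
ATTRIBUTES = ["giant", "glowing", "ancient"]
RELATIONS = ["riding", "standing on", "next to"]

def sample_scene_graph(num_objects=2, seed=None):
    """Sample attributed object nodes plus relation edges chaining them."""
    rng = random.Random(seed)
    nodes = [{"obj": rng.choice(OBJECTS), "attr": rng.choice(ATTRIBUTES)}
             for _ in range(num_objects)]
    edges = [(i, rng.choice(RELATIONS), i + 1) for i in range(num_objects - 1)]
    return nodes, edges

def to_caption(nodes, edges):
    """Flatten the graph into a caption usable as a text-to-vision prompt."""
    phrase = lambda n: f"a {n['attr']} {n['obj']}"
    parts = [f"{phrase(nodes[s])} {r} {phrase(nodes[t])}" for s, r, t in edges]
    return ", ".join(parts) or phrase(nodes[0])

nodes, edges = sample_scene_graph(seed=0)
print(to_caption(nodes, edges))   # prints a sampled, possibly fantastical caption
```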
arXiv:2412.07819 [pdf, other] cs.LG cs.AI
Intelligent System for Automated Molecular Patent Infringement Assessment
Authors: Yaorui Shi, Sihang Li, Taiyan Zhang, Xi Fang, Jiankun Wang, Zhiyuan Liu, Guojiang Zhao, Zhengdan Zhu, Zhifeng Gao, Renxin Zhong, Linfeng Zhang, Guolin Ke, Weinan E, Hengxing Cai, Xiang Wang
Abstract: Automated drug discovery offers significant potential for accelerating the development of novel therapeutics by substituting labor-intensive human workflows with machine-driven processes. However, molecules generated by artificial intelligence may unintentionally infringe on existing patents, posing legal and financial risks that impede the full automation of drug discovery pipelines. This paper introduces PatentFinder, a novel multi-agent, tool-enhanced intelligence system that can accurately and comprehensively evaluate small molecules for patent infringement. PatentFinder features five specialized agents that collaboratively analyze patent claims and molecular structures with heuristic and model-based tools, generating interpretable infringement reports. To support systematic evaluation, we curate MolPatent-240, a benchmark dataset tailored for patent infringement assessment algorithms. On this benchmark, PatentFinder outperforms baseline methods that rely solely on large language models or specialized chemical tools, achieving a 13.8% improvement in F1-score and a 12% increase in accuracy. PatentFinder also autonomously generates detailed and interpretable patent infringement reports; its high accuracy and interpretability make it a valuable and reliable tool for automating patent infringement assessment, offering a practical solution for integrating patent protection analysis into the drug discovery pipeline.
Submitted 12 January, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
href="/search/cs?searchtype=author&query=Lv%2C+H">Han Lv</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a> , et al. (17 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05271v4-abstract-short" style="display: inline;"> We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05271v4-abstract-full').style.display = 'inline'; document.getElementById('2412.05271v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05271v4-abstract-full" style="display: none;"> We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision encoders, language models, dataset sizes, and test-time configurations. Through extensive evaluations on a wide range of benchmarks, including multi-discipline reasoning, document understanding, multi-image / video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing, InternVL 2.5 exhibits competitive performance, rivaling leading commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is the first open-source MLLMs to surpass 70% on the MMMU benchmark, achieving a 3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing strong potential for test-time scaling. We hope this model contributes to the open-source community by setting new standards for developing and applying multimodal AI systems. HuggingFace demo see https://huggingface.co/spaces/OpenGVLab/InternVL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05271v4-abstract-full').style.display = 'none'; document.getElementById('2412.05271v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03936">arXiv:2412.03936</a> <span> [<a href="https://arxiv.org/pdf/2412.03936">pdf</a>, <a href="https://arxiv.org/format/2412.03936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning Modeling Method for RF Devices Based on Uniform Noise Training Set </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhaokun Hu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yindong Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Houjun Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiayong Yu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zihang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03936v1-abstract-short" style="display: inline;"> As the scale and complexity of integrated circuits continue to increase, traditional modeling methods are struggling to address the nonlinear challenges in radio frequency (RF) chips. Deep learning has been increasingly applied to RF device modeling. This paper proposes a deep learning-based modeling method for RF devices using a uniform noise training set, aimed at modeling and fitting the nonlin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03936v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03936v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03936v1-abstract-full" style="display: none;"> As the scale and complexity of integrated circuits continue to increase, traditional modeling methods are struggling to address the nonlinear challenges in radio frequency (RF) chips. Deep learning has been increasingly applied to RF device modeling. This paper proposes a deep learning-based modeling method for RF devices using a uniform noise training set, aimed at modeling and fitting the nonlinear characteristics of RF devices. We hypothesize that a uniform noise signal can encompass the full range of characteristics across both frequency and amplitude, and that a deep learning model can effectively capture and learn these features. Based on this hypothesis, the paper designs a complete integrated circuit modeling process based on measured data, including data collection, processing, and neural network training. The proposed method is experimentally validated using the RF amplifier PW210 as a case study. Experimental results show that the uniform noise training set allows the model to capture the nonlinear characteristics of RF devices, and the trained model can predict waveform patterns it has never encountered before. 
The proposed deep learning-based RF device modeling method, using a uniform noise training set, demonstrates strong generalization capability and excellent training performance, offering high practical application value. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03936v1-abstract-full').style.display = 'none'; document.getElementById('2412.03936v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages,11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01936">arXiv:2412.01936</a> <span> [<a href="https://arxiv.org/pdf/2412.01936">pdf</a>, <a href="https://arxiv.org/format/2412.01936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Kernel-Free Universum Quadratic Surface Twin Support Vector Machines for Imbalanced Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Moosaei%2C+H">Hossein Moosaei</a>, <a href="/search/cs?searchtype=author&query=Hlad%C3%ADk%2C+M">Milan Hlad铆k</a>, <a href="/search/cs?searchtype=author&query=Mousavi%2C+A">Ahmad Mousavi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zheming Gao</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+H">Haojie Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01936v1-abstract-short" style="display: inline;"> Binary classification tasks with imbalanced classes pose significant challenges in machine learning. Traditional classifiers often struggle to accurately capture the characteristics of the minority class, resulting in biased models with subpar predictive performance. In this paper, we introduce a novel approach to tackle this issue by leveraging Universum points to support the minority class withi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01936v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01936v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01936v1-abstract-full" style="display: none;"> Binary classification tasks with imbalanced classes pose significant challenges in machine learning. Traditional classifiers often struggle to accurately capture the characteristics of the minority class, resulting in biased models with subpar predictive performance. In this paper, we introduce a novel approach to tackle this issue by leveraging Universum points to support the minority class within quadratic twin support vector machine models. 
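
The training recipe in this abstract can be pictured with a short sketch. This is a minimal illustration under assumptions, not the paper's code: a made-up pw210_like nonlinearity stands in for the measured PW210 amplifier, a uniform noise excitation covers the amplitude range in one training set, and a small network fits the input-output pairs before being probed on a sine waveform it never saw during training.

import torch
import torch.nn as nn

torch.manual_seed(0)

def pw210_like(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for a measured amplifier: gain plus soft saturation.
    return torch.tanh(2.0 * x) + 0.1 * x

# Uniform noise spans the full amplitude range in a single training set.
x_train = (torch.rand(4096, 1) * 2.0) - 1.0          # U(-1, 1)
y_train = pw210_like(x_train)

model = nn.Sequential(nn.Linear(1, 64), nn.Tanh(),
                      nn.Linear(64, 64), nn.Tanh(),
                      nn.Linear(64, 1))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(2000):
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(x_train), y_train)
    loss.backward()
    opt.step()

# Generalization probe: a waveform the model never saw during training.
t = torch.linspace(0, 1, 256).unsqueeze(1)
x_test = 0.8 * torch.sin(2 * torch.pi * 5 * t)
print(nn.functional.mse_loss(model(x_test), pw210_like(x_test)).item())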

arXiv:2412.01936 [pdf, other]  cs.LG, cs.AI, math.OC
Kernel-Free Universum Quadratic Surface Twin Support Vector Machines for Imbalanced Data
Authors: Hossein Moosaei, Milan Hladík, Ahmad Mousavi, Zheming Gao, Haojie Fu
Abstract: Binary classification tasks with imbalanced classes pose significant challenges in machine learning. Traditional classifiers often struggle to accurately capture the characteristics of the minority class, resulting in biased models with subpar predictive performance. In this paper, we introduce a novel approach to tackle this issue by leveraging Universum points to support the minority class within quadratic twin support vector machine models. Unlike traditional classifiers, our models utilize quadratic surfaces instead of hyperplanes for binary classification, providing greater flexibility in modeling complex decision boundaries. By incorporating Universum points, our approach enhances classification accuracy and generalization performance on imbalanced datasets. We generated four artificial datasets to demonstrate the flexibility of the proposed methods. Additionally, we validated the effectiveness of our approach through empirical evaluations on benchmark datasets, showing superior performance compared to conventional classifiers and existing methods for imbalanced classification.
Submitted 2 December, 2024; originally announced December 2024.
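
The "kernel-free quadratic surface" idea is easy to demonstrate: learning f(x) = x'Wx + b'x + c is equivalent to fitting a linear separator on degree-2 monomial features. The sketch below shows only that ingredient; the Universum points and the twin-SVM formulation from the paper are omitted.

import numpy as np
from sklearn.datasets import make_circles
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import LinearSVC

# A dataset that no hyperplane separates, but a quadratic surface does.
X, y = make_circles(n_samples=400, noise=0.08, factor=0.5, random_state=0)

quad = PolynomialFeatures(degree=2, include_bias=False)
Xq = quad.fit_transform(X)            # [x1, x2, x1^2, x1*x2, x2^2]

# A hyperplane in monomial space is a quadratic surface in input space.
clf = LinearSVC(C=1.0, max_iter=10000).fit(Xq, y)
print("train accuracy:", clf.score(Xq, y))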

arXiv:2411.19041 [pdf, other]  cs.CV
TAMT: Temporal-Aware Model Tuning for Cross-Domain Few-Shot Action Recognition
Authors: Yilong Wang, Zilin Gao, Qilong Wang, Zhaofeng Chen, Peihua Li, Qinghua Hu
Abstract: Going beyond few-shot action recognition (FSAR), cross-domain FSAR (CDFSAR) has attracted recent research interest by addressing the domain gap in source-to-target transfer learning. Existing CDFSAR methods mainly focus on joint training of source and target data to mitigate the side effects of the domain gap. However, such methods suffer from two limitations. First, pair-wise joint training requires retraining deep models for each combination of one source dataset and multiple target ones, which incurs heavy computation cost, especially for large source and small target data. Second, models joint-trained this way are transferred to the target domain in a straightforward manner, hardly exploiting the full potential of pre-training and thus limiting recognition performance. To overcome these limitations, this paper proposes a simple yet effective baseline, namely Temporal-Aware Model Tuning (TAMT), for CDFSAR. Specifically, TAMT adopts a decoupled paradigm of pre-training on source data and fine-tuning on target data, which avoids retraining for multiple target datasets with a single source. To explore the potential of pre-trained models effectively and efficiently when transferring to the target domain, TAMT proposes a Hierarchical Temporal Tuning Network (HTTN), whose core consists of local temporal-aware adapters (TAA) and global temporal-aware moment tuning (GTMT). In particular, TAA learns a few parameters to recalibrate the intermediate features of the frozen pre-trained model, enabling efficient adaptation to target domains, while GTMT helps generate powerful video representations, improving matching performance on the target domain. Experiments on several widely used video benchmarks show that TAMT outperforms recently proposed counterparts by 13%~31%, achieving new state-of-the-art CDFSAR results.
Submitted 28 November, 2024; originally announced November 2024.
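
As a rough sketch of the adapter idea (not the authors' code), the module below recalibrates frozen per-frame features with a small bottleneck plus a depthwise temporal convolution, so only a few parameters are trained.

import torch
import torch.nn as nn

class TemporalAdapter(nn.Module):
    # Illustrative temporal-aware adapter: down-project, mix across
    # frames, up-project, and add back as a residual correction.
    def __init__(self, dim: int, bottleneck: int = 16):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.temporal = nn.Conv1d(bottleneck, bottleneck, kernel_size=3,
                                  padding=1, groups=bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, dim) features from a frozen backbone.
        h = self.down(x).transpose(1, 2)                  # (B, bottleneck, T)
        h = torch.relu(self.temporal(h)).transpose(1, 2)  # mix over frames
        return x + self.up(h)                             # residual recalibration

frozen_feats = torch.randn(2, 8, 768)   # e.g. 8 frames of ViT features
adapter = TemporalAdapter(768)
print(adapter(frozen_feats).shape)      # torch.Size([2, 8, 768])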

arXiv:2411.18630 [pdf, other]  cs.GR, cs.CV
Volume Rendering of Human Hand Anatomy
Authors: Jingtao Huang, Bohan Wang, Zhiyuan Gao, Mianlun Zheng, George Matcuk, Jernej Barbic
Abstract: We study the design of transfer functions for volumetric rendering of magnetic resonance imaging (MRI) datasets of human hands. Human hands are anatomically complex, containing various organs within a limited space, which presents challenges for volumetric rendering. We focus on the hand's musculoskeletal organs because they are volumetrically the largest inside the hand and the most important for its main function, the manipulation of objects. While volumetric rendering is a mature field, the choice of transfer function for the different organs is arguably just as important as the choice of the specific volume rendering algorithm; we demonstrate that it significantly influences the clarity and interpretability of the resulting images. We assume that the hand MRI scans have already been segmented into the different organs (bones, muscles, tendons, ligaments, subcutaneous fat, etc.). Our method uses the hand MRI volume data, together with the geometry of its inner organs and their known segmentation, to produce high-quality volume rendering images of the hand, and permits fine control over the appearance of each tissue. We contribute two families of transfer functions to emphasize different hand tissues of interest while preserving the visual context of the hand. We also discuss and reduce artifacts present in standard volume ray-casting of human hands. We evaluate our volumetric rendering on five challenging hand motion sequences. Our experimental results demonstrate that our method improves hand anatomy visualization compared to standard surface and volume rendering techniques.
Submitted 13 November, 2024; originally announced November 2024.
Comments: 10 pages
ACM Class: I.3.7

arXiv:2411.18018 [pdf, other]  eess.IV, cs.CV
Neural Finite-State Machines for Surgical Phase Recognition
Authors: Hao Ding, Zhongpai Gao, Benjamin Planche, Tianyu Luan, Abhishek Sharma, Meng Zheng, Ange Lou, Terrence Chen, Mathias Unberath, Ziyan Wu
Abstract: Surgical phase recognition is essential for analyzing procedure-specific surgical videos. While recent transformer-based architectures have advanced sequence processing capabilities, they struggle to maintain consistency across lengthy surgical procedures. Drawing inspiration from the finite-state interpretation of classical hidden Markov models, we introduce the neural finite-state machine (NFSM) module, which bridges procedural understanding with deep learning approaches. NFSM combines procedure-level understanding with neural networks through global state embeddings, attention-based dynamic transition tables, and transition-aware training and inference mechanisms for offline and online applications. When integrated into our future-aware architecture, NFSM improves video-level accuracy, phase-level precision, recall, and Jaccard indices on the Cholec80 dataset by 2.3, 3.2, 3.0, and 4.8 percentage points, respectively. As an add-on module to existing state-of-the-art models such as Surgformer, NFSM further enhances performance, demonstrating its complementary value. Extended experiments on non-surgical datasets validate NFSM's generalizability beyond surgical domains. Comprehensive experiments demonstrate that incorporating NFSM into deep learning frameworks enables more robust and consistent phase recognition across long procedural videos.
Submitted 26 November, 2024; originally announced November 2024.
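
Reading only from the abstract, the transition-table idea might be sketched as follows; this is a speculative toy, not the published architecture. Learnable state embeddings define an attention-derived transition matrix conditioned on the current frame, and a belief over phases is propagated Markov-style.

import torch
import torch.nn as nn

class NeuralFSM(nn.Module):
    def __init__(self, n_states: int, dim: int):
        super().__init__()
        self.states = nn.Parameter(torch.randn(n_states, dim))  # state embeddings
        self.query = nn.Linear(dim, dim)

    def forward(self, frame_feats: torch.Tensor) -> torch.Tensor:
        # frame_feats: (T, dim); returns (T, n_states) phase posteriors.
        n = self.states.shape[0]
        belief = torch.full((n,), 1.0 / n)   # uniform initial belief
        out = []
        for f in frame_feats:
            # Dynamic transition table conditioned on the current frame.
            logits = self.states @ self.query(f)                     # (n,)
            trans = torch.softmax(
                self.states @ self.states.t() + logits, dim=-1)      # (n, n), rows sum to 1
            belief = belief @ trans                                  # Markov update
            out.append(belief)
        return torch.stack(out)

print(NeuralFSM(7, 32)(torch.randn(100, 32)).shape)  # torch.Size([100, 7])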

arXiv:2411.16932 [pdf, other]  cs.CV
Seq2Time: Sequential Knowledge Transfer for Video LLM Temporal Grounding
Authors: Andong Deng, Zhongpai Gao, Anwesa Choudhuri, Benjamin Planche, Meng Zheng, Bin Wang, Terrence Chen, Chen Chen, Ziyan Wu
Abstract: Temporal awareness is essential for video large language models (LLMs) to understand and reason about events within long videos, enabling applications like dense video captioning and temporal video grounding in a unified system. However, the scarcity of long videos with detailed captions and precise temporal annotations limits their temporal awareness. In this paper, we propose Seq2Time, a data-oriented training paradigm that leverages sequences of images and short video clips to enhance temporal awareness in long videos. By converting sequence positions into temporal annotations, we transform large-scale image and clip captioning datasets into sequences that mimic the temporal structure of long videos, enabling self-supervised training with abundant time-sensitive data. To enable sequence-to-time knowledge transfer, we introduce a novel time representation that unifies positional information across image sequences, clip sequences, and long videos. Experiments demonstrate the effectiveness of our method, achieving a 27.6% improvement in F1 score and 44.8% in CIDEr on the YouCook2 benchmark and a 14.7% increase in recall on the Charades-STA benchmark compared to the baseline.
Submitted 25 November, 2024; originally announced November 2024.
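
The position-to-timestamp conversion can be illustrated with a toy function. The token format here is invented; the point is only that an image's index in a captioned sequence becomes a relative time annotation, mimicking a temporally grounded video.

def seq_to_time(captions: list[str], clip_len: float = 1.0):
    # Treat the k-th captioned image as a pseudo-clip of length clip_len,
    # then express its span as relative timestamps in [0, 1].
    n = len(captions)
    samples = []
    for k, cap in enumerate(captions):
        start, end = k * clip_len, (k + 1) * clip_len
        token = f"<t={start/(n*clip_len):.2f}>-<t={end/(n*clip_len):.2f}>"
        samples.append((token, cap))
    return samples

for ts, cap in seq_to_time(["crack eggs", "whisk", "pour into pan", "flip"]):
    print(ts, cap)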

arXiv:2411.16681 [pdf, other]  cs.CV
Factorized Visual Tokenization and Generation
Authors: Zechen Bai, Jianxiong Gao, Ziteng Gao, Pichao Wang, Zheng Zhang, Tong He, Mike Zheng Shou
Abstract: Visual tokenizers are fundamental to image generation. They convert visual data into discrete tokens, enabling transformer-based models to excel at image generation. Despite their success, VQ-based tokenizers like VQGAN face significant limitations due to constrained vocabulary sizes. Simply expanding the codebook often leads to training instability and diminishing performance gains, making scalability a critical challenge. In this work, we introduce Factorized Quantization (FQ), a novel approach that revitalizes VQ-based tokenizers by decomposing a large codebook into multiple independent sub-codebooks. This factorization reduces the lookup complexity of large codebooks, enabling more efficient and scalable visual tokenization. To ensure each sub-codebook captures distinct and complementary information, we propose a disentanglement regularization that explicitly reduces redundancy, promoting diversity across the sub-codebooks. Furthermore, we integrate representation learning into the training process, leveraging pretrained vision models like CLIP and DINO to infuse semantic richness into the learned representations. This design ensures our tokenizer captures diverse semantic levels, leading to more expressive and disentangled representations. Experiments show that the proposed FQGAN model substantially improves the reconstruction quality of visual tokenizers, achieving state-of-the-art performance. We further demonstrate that this tokenizer can be effectively adapted to auto-regressive image generation. Project page: https://showlab.github.io/FQGAN
Submitted 27 November, 2024; v1 submitted 25 November, 2024; originally announced November 2024.
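
The factorization itself is compact to sketch: split each latent vector into chunks and quantize each chunk against its own small sub-codebook, so the effective vocabulary is the product of the sub-codebook sizes while each lookup stays small. A minimal version with arbitrary sizes (the paper's regularization and training are omitted):

import torch

def factorized_quantize(z, sub_codebooks):
    # z: (batch, dim); sub_codebooks: list of (K_i, dim/len) tensors.
    chunks = z.chunk(len(sub_codebooks), dim=-1)
    quantized, indices = [], []
    for chunk, book in zip(chunks, sub_codebooks):
        d = torch.cdist(chunk, book)          # distances to sub-codebook entries
        idx = d.argmin(dim=-1)                # nearest-code index per chunk
        quantized.append(book[idx])
        indices.append(idx)
    return torch.cat(quantized, dim=-1), torch.stack(indices, dim=-1)

torch.manual_seed(0)
books = [torch.randn(256, 32) for _ in range(2)]   # 256*256 effective codes
zq, idx = factorized_quantize(torch.randn(4, 64), books)
print(zq.shape, idx.shape)                          # (4, 64) and (4, 2)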
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13076">arXiv:2411.13076</a> <span> [<a href="https://arxiv.org/pdf/2411.13076">pdf</a>, <a href="https://arxiv.org/format/2411.13076">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hints of Prompt: Enhancing Visual Representation for Multimodal LLMs in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhanning Gao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+M">Maosheng Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhili Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+T">Tongyi Cao</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+H">Honggang Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13076v1-abstract-short" style="display: inline;"> In light of the dynamic nature of autonomous driving environments and stringent safety requirements, general MLLMs combined with CLIP alone often struggle to represent driving-specific scenarios accurately, particularly in complex interactions and long-tail cases. To address this, we propose the Hints of Prompt (HoP) framework, which introduces three key enhancements: Affinity hint to emphasize in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13076v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13076v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13076v1-abstract-full" style="display: none;"> In light of the dynamic nature of autonomous driving environments and stringent safety requirements, general MLLMs combined with CLIP alone often struggle to represent driving-specific scenarios accurately, particularly in complex interactions and long-tail cases. To address this, we propose the Hints of Prompt (HoP) framework, which introduces three key enhancements: Affinity hint to emphasize instance-level structure by strengthening token-wise connections, Semantic hint to incorporate high-level information relevant to driving-specific cases, such as complex interactions among vehicles and traffic signs, and Question hint to align visual features with the query context, focusing on question-relevant regions. These hints are fused through a Hint Fusion module, enriching visual representations and enhancing multimodal reasoning for autonomous driving VQA tasks. Extensive experiments confirm the effectiveness of the HoP framework, showing it significantly outperforms previous state-of-the-art methods across all key metrics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13076v1-abstract-full').style.display = 'none'; document.getElementById('2411.13076v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11916">arXiv:2411.11916</a> <span> [<a href="https://arxiv.org/pdf/2411.11916">pdf</a>, <a href="https://arxiv.org/format/2411.11916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> From Words to Structured Visuals: A Benchmark and Framework for Text-to-Diagram Generation and Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jingxuan Wei</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Gaowei Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangyang Gao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Linzhuang Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bihui Yu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+R">Ruifeng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11916v1-abstract-short" style="display: inline;"> We introduce the task of text-to-diagram generation, which focuses on creating structured visual representations directly from textual descriptions. Existing approaches in text-to-image and text-to-code generation lack the logical organization and flexibility needed to produce accurate, editable diagrams, often resulting in outputs that are either unstructured or difficult to modify. To address th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11916v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11916v1-abstract-full" style="display: none;"> We introduce the task of text-to-diagram generation, which focuses on creating structured visual representations directly from textual descriptions. Existing approaches in text-to-image and text-to-code generation lack the logical organization and flexibility needed to produce accurate, editable diagrams, often resulting in outputs that are either unstructured or difficult to modify. To address this gap, we introduce DiagramGenBenchmark, a comprehensive evaluation framework encompassing eight distinct diagram categories, including flowcharts, model architecture diagrams, and mind maps. Additionally, we present DiagramAgent, an innovative framework with four core modules-Plan Agent, Code Agent, Check Agent, and Diagram-to-Code Agent-designed to facilitate both the generation and refinement of complex diagrams. 
Our extensive experiments, which combine objective metrics with human evaluations, demonstrate that DiagramAgent significantly outperforms existing baseline models in terms of accuracy, structural coherence, and modifiability. This work not only establishes a foundational benchmark for the text-to-diagram generation task but also introduces a powerful toolset to advance research and applications in this emerging area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11916v1-abstract-full').style.display = 'none'; document.getElementById('2411.11916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11357">arXiv:2411.11357</a> <span> [<a href="https://arxiv.org/pdf/2411.11357">pdf</a>, <a href="https://arxiv.org/format/2411.11357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-guided Zero-Shot Object Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingjing Wang</a>, <a href="/search/cs?searchtype=author&query=Piao%2C+X">Xinglin Piao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zongzhi Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Baocai Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11357v1-abstract-short" style="display: inline;"> Object localization is a hot issue in computer vision area, which aims to identify and determine the precise location of specific objects from image or video. Most existing object localization methods heavily rely on extensive labeled data, which are costly to annotate and constrain their applicability. Therefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for addressing the a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11357v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11357v1-abstract-full" style="display: none;"> Object localization is a hot issue in computer vision area, which aims to identify and determine the precise location of specific objects from image or video. Most existing object localization methods heavily rely on extensive labeled data, which are costly to annotate and constrain their applicability. Therefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for addressing the aforementioned challenges. In the proposed framework, we introduce the Contrastive Language Image Pre-training (CLIP) module which could integrate visual and linguistic information effectively. 
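
The four-module loop might be skeletonized as below; the llm callable and all prompts are placeholders, and the real agents are surely richer.

def diagram_agent(request: str, llm, max_rounds: int = 3) -> str:
    plan = llm(f"Plan the diagram structure for: {request}")         # Plan Agent
    code = llm(f"Write diagram code for this plan: {plan}")          # Code Agent
    for _ in range(max_rounds):
        verdict = llm(f"Check this diagram code for errors: {code}") # Check Agent
        if verdict.strip().lower().startswith("ok"):
            break
        code = llm(f"Fix the code given this feedback: {verdict}\n{code}")
    return code

# Editing would run the other way: a Diagram-to-Code Agent first recovers
# editable code from an existing diagram, then re-enters the loop above.
print(diagram_agent("a three-stage data pipeline", lambda p: "ok: stub"))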

arXiv:2411.11357 [pdf, other]  cs.CV
Text-guided Zero-Shot Object Localization
Authors: Jingjing Wang, Xinglin Piao, Zongzhi Gao, Bo Li, Yong Zhang, Baocai Yin
Abstract: Object localization aims to identify and determine the precise location of specific objects in an image or video. Most existing object localization methods rely heavily on extensive labeled data, which is costly to annotate and constrains their applicability. We therefore propose a new Zero-Shot Object Localization (ZSOL) framework to address this challenge. The framework introduces a Contrastive Language-Image Pre-training (CLIP) module to integrate visual and linguistic information effectively, together with a Text Self-Similarity Matching (TSSM) module that improves localization accuracy by enhancing the representation of the text features extracted by CLIP. The framework can thus be guided by prompt words to identify and locate specific objects in an image in the absence of labeled samples. Extensive experiments demonstrate that the proposed method significantly improves localization performance and establishes an effective benchmark for further research.
Submitted 18 November, 2024; originally announced November 2024.
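
Setting the TSSM module aside, a zero-shot localization backbone of this kind reduces to similarity matching between a text embedding and patch embeddings. A self-contained toy with random stand-ins for CLIP features:

import numpy as np

rng = np.random.default_rng(0)
patch_emb = rng.normal(size=(7, 7, 512))                  # 7x7 grid of patch features
text_emb = patch_emb[2, 4] + 0.1 * rng.normal(size=512)   # query "near" one patch

def localize(patches, text):
    # Cosine similarity between the text query and every patch.
    p = patches / np.linalg.norm(patches, axis=-1, keepdims=True)
    t = text / np.linalg.norm(text)
    sim = p @ t                                            # similarity heatmap
    return np.unravel_index(sim.argmax(), sim.shape), sim

loc, heat = localize(patch_emb, text_emb)
print("predicted patch:", loc)                             # expected: (2, 4)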

arXiv:2411.10442 [pdf, other]  cs.CL, cs.CV
Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization
Authors: Weiyun Wang, Zhe Chen, Wenhai Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Jinguo Zhu, Xizhou Zhu, Lewei Lu, Yu Qiao, Jifeng Dai
Abstract: Existing open-source multimodal large language models (MLLMs) generally follow a training process involving pre-training and supervised fine-tuning. However, these models suffer from distribution shifts that limit their multimodal reasoning, particularly Chain-of-Thought (CoT) performance. To address this, we introduce a preference optimization (PO) process to enhance the multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data side, we design an automated preference-data construction pipeline to create MMPR, a high-quality, large-scale multimodal reasoning preference dataset; and (2) on the model side, we explore integrating PO with MLLMs, developing a simple yet effective method, termed Mixed Preference Optimization (MPO), which boosts multimodal CoT performance. Our approach demonstrates improved performance across multiple benchmarks, particularly in multimodal reasoning tasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on MathVista, outperforming InternVL2-8B by 8.7 points and achieving performance comparable to the 10x larger InternVL2-76B. We hope this study can inspire further advancements in MLLMs. Code, data, and models will be publicly released.
Submitted 15 November, 2024; originally announced November 2024.
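
Preference optimization of this kind typically builds on a DPO-style objective over (chosen, rejected) response log-probabilities; MPO, as the abstract describes it, mixes such a preference term with other losses. Only the standard preference term is sketched here, with dummy numbers.

import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected,
             ref_logp_chosen, ref_logp_rejected, beta: float = 0.1):
    # Reward margin is the log-probability gain over a frozen reference
    # model for the chosen response minus that for the rejected one.
    margin = ((logp_chosen - ref_logp_chosen)
              - (logp_rejected - ref_logp_rejected))
    return -F.logsigmoid(beta * margin).mean()

loss = dpo_loss(torch.tensor([-12.0]), torch.tensor([-15.0]),
                torch.tensor([-13.0]), torch.tensor([-14.5]))
print(loss.item())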

arXiv:2411.10321 [pdf, other]  cs.CV
Probabilistic Prior Driven Attention Mechanism Based on Diffusion Model for Imaging Through Atmospheric Turbulence
Authors: Guodong Sun, Qixiang Ma, Liqiang Zhang, Hongwei Wang, Zixuan Gao, Haotian Zhang
Abstract: Atmospheric turbulence introduces severe spatial and geometric distortions, challenging traditional image restoration methods. We propose the Probabilistic Prior Turbulence Removal Network (PPTRN), which combines probabilistic diffusion-based prior modeling with Transformer-driven feature extraction to address this issue. PPTRN employs a two-stage approach: first, a latent encoder and Transformer are jointly trained on clear images to establish robust feature representations; then, a Denoising Diffusion Probabilistic Model (DDPM) models prior distributions over latent vectors, guiding the Transformer in capturing the diverse feature variations essential for restoration. A key innovation in PPTRN is the Probabilistic Prior Driven Cross Attention mechanism, which integrates the DDPM-generated prior with feature embeddings to reduce artifacts and enhance spatial coherence. Extensive experiments validate that PPTRN significantly improves restoration quality on turbulence-degraded images, setting a new benchmark in clarity and structural fidelity.
Submitted 15 November, 2024; originally announced November 2024.
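
The prior-driven cross-attention step can be sketched generically: restoration features attend to prior latents (here random tensors standing in for DDPM samples) and the result is added back as guidance. Dimensions are arbitrary.

import torch
import torch.nn as nn

dim = 128
cross_attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)

feats = torch.randn(1, 196, dim)   # degraded-image feature tokens
prior = torch.randn(1, 16, dim)    # stand-in for DDPM-sampled prior latents

# Features query the prior; attention output guides the restoration.
guided, _ = cross_attn(query=feats, key=prior, value=prior)
print((feats + guided).shape)      # residual guidance: torch.Size([1, 196, 128])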
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10321v1-abstract-full').style.display = 'none'; document.getElementById('2411.10321v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09429">arXiv:2411.09429</a> <span> [<a href="https://arxiv.org/pdf/2411.09429">pdf</a>, <a href="https://arxiv.org/format/2411.09429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Superconductivity">cond-mat.supr-con</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI-driven inverse design of materials: Past, present and future </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao-Qi Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin-De Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Meng-Yuan Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhen Feng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bo-Wen Yao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peng-Jie Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ze-Feng Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhong-Yi Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09429v3-abstract-short" style="display: inline;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v3-abstract-full').style.display = 'inline'; document.getElementById('2411.09429v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09429v3-abstract-full" style="display: none;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long explored new materials through a large number of experiments and proposed corresponding theoretical systems to predict new material properties and structures. 
With the improvement of computational power, researchers have gradually developed various electronic structure calculation methods, such as the density functional theory and high-throughput computational methods. Recently, the rapid development of artificial intelligence technology in the field of computer science has enabled the effective characterization of the implicit association between material properties and structures, thus opening up an efficient paradigm for the inverse design of functional materials. A significant progress has been made in inverse design of materials based on generative and discriminative models, attracting widespread attention from researchers. Considering this rapid technological progress, in this survey, we look back on the latest advancements in AI-driven inverse design of materials by introducing the background, key findings, and mainstream technological development routes. In addition, we summarize the remaining issues for future directions. This survey provides the latest overview of AI-driven inverse design of materials, which can serve as a useful resource for researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v3-abstract-full').style.display = 'none'; document.getElementById('2411.09429v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44 pages, 6 figures, 2 tables</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Gao%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul 
class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 
243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>