Search | arXiv e-print repository
Showing 1–50 of 557 results for author: Zheng, S
Searching in archive cs. (Search in all archives: https://arxiv.org/search/?searchtype=author&query=Zheng%2C+S)
Search v0.5.6 released 2020-02-24
Results: 50 per page, sorted by announcement date (newest first).
1. arXiv:2411.16815 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: FREE-Merging: Fourier Transform for Model Merging with Lightweight Experts
   Authors: Shenghe Zheng, Hongzhi Wang
   Abstract: In the current era of rapid expansion in model scale, there is an increasing availability of open-source model weights for various tasks. However, the capabilities of a single fine-tuned model often fall short of meeting diverse deployment needs. Model merging has thus emerged as a widely studied method for efficiently building a single model tailored to multiple tasks from existing models. Nevertheless, existing model merging methods face challenging trade-offs between performance and deployment costs, primarily due to task conflicts within the merged network. Our analysis of neural networks reveals that some task-specific information introduced by fine-tuning minimally enhances performance but heavily impacts generalization, leading to task conflicts. To mitigate the impact of this information, we propose FR-Merging, an innovative method that leverages frequency-domain information to efficiently filter harmful specialized information, thereby minimizing the impact of task conflicts on the backbone with minimal cost. Since performance loss is inevitable with cost-free merging methods, we introduce a lightweight task-specific expert that can be dynamically integrated during inference to compensate for information loss. This framework, FREE-Merging (FR-Merging with lightweight experts), strikes a balanced trade-off between training cost, inference speed, storage requirements, and performance. We demonstrate the effectiveness of both FR-Merging and FREE-Merging on multiple tasks across CV, NLP, and Multi-Modal domains and show that they can be flexibly adapted to meet specific needs.
   Submitted 25 November, 2024; originally announced November 2024.
   Comments: 16 pages, 5 figures
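The abstract describes filtering harmful task-specific information in the frequency domain before merging. As a rough sketch of that idea only (the function names, the low-pass filter, and the merging coefficient below are illustrative assumptions, not details of FR-Merging):

    import numpy as np

    def lowpass_task_vector(task_vector, keep_ratio=0.2):
        """Keep only the lowest-frequency components of a 1-D task vector.
        Illustrative stand-in for a frequency-domain filter."""
        spectrum = np.fft.rfft(task_vector)
        cutoff = max(1, int(len(spectrum) * keep_ratio))
        spectrum[cutoff:] = 0.0                      # drop high-frequency content
        return np.fft.irfft(spectrum, n=len(task_vector))

    def merge(backbone, finetuned, alpha=0.3, keep_ratio=0.2):
        """Fold several fine-tuned models into one backbone (sketch)."""
        merged = backbone.copy()
        for weights in finetuned:
            task_vector = weights - backbone         # fine-tuned minus base weights
            merged += alpha * lowpass_task_vector(task_vector, keep_ratio)
        return merged

    base = np.random.randn(1024)
    models = [base + 0.1 * np.random.randn(1024) for _ in range(3)]
    print(merge(base, models).shape)                 # (1024,)

In practice such filtering would operate per layer on flattened weight tensors, and the lightweight experts mentioned in the abstract would be added on top of the merged backbone at inference time.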
2. arXiv:2411.16694 [pdf, other]
   Subjects: q-bio.BM (Biomolecules); cs.AI (Artificial Intelligence)
   Title: Reaction-conditioned De Novo Enzyme Design with GENzyme
   Authors: Chenqing Hua, Jiarui Lu, Yong Liu, Odin Zhang, Jian Tang, Rex Ying, Wengong Jin, Guy Wolf, Doina Precup, Shuangjia Zheng
   Abstract: The introduction of models like RFDiffusionAA, AlphaFold3, AlphaProteo, and Chai1 has revolutionized protein structure modeling and interaction prediction, primarily from a binding perspective, focusing on creating ideal lock-and-key models. However, these methods can fall short for enzyme-substrate interactions, where perfect binding models are rare and induced-fit states are more common. To address this, we shift to a functional perspective for enzyme design, where the enzyme function is defined by the reaction it catalyzes. Here, we introduce GENzyme, a de novo enzyme design model that takes a catalytic reaction as input and generates the catalytic pocket, full enzyme structure, and enzyme-substrate binding complex. GENzyme is an end-to-end, three-stage model that integrates (1) a catalytic pocket generation and sequence co-design module, (2) a pocket inpainting and enzyme inverse folding module, and (3) a binding and screening module to optimize and predict enzyme-substrate complexes. The entire design process is driven by the catalytic reaction being targeted. This reaction-first approach allows for more accurate and biologically relevant enzyme design, potentially surpassing structure-based and binding-focused models in creating enzymes capable of catalyzing specific reactions. We provide GENzyme code at https://github.com/WillHua127/GENzyme.
   Submitted 9 November, 2024; originally announced November 2024.
3. arXiv:2411.16156 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
   Title: VideoOrion: Tokenizing Object Dynamics in Videos
   Authors: Yicheng Feng, Yijiang Li, Wanpeng Zhang, Sipeng Zheng, Zongqing Lu
   Abstract: We present VideoOrion, a Video Large Language Model (Video-LLM) that explicitly captures the key semantic information in videos: the spatial-temporal dynamics of objects throughout the videos. VideoOrion employs expert vision models to extract object dynamics through a detect-segment-track pipeline, encoding them into a set of object tokens by aggregating spatial-temporal object features. Our method addresses the persistent challenge in Video-LLMs of efficiently compressing high-dimensional video data into semantic tokens that are comprehensible to LLMs. Compared to prior methods, which resort to downsampling the original video or aggregating visual tokens using resamplers, leading to information loss and entangled semantics, VideoOrion not only offers a more natural and efficient way to derive compact, disentangled semantic representations but also enables explicit object modeling of video content with minimal computational cost. Moreover, the introduced object tokens naturally allow VideoOrion to accomplish video-based referring tasks. Experimental results show that VideoOrion can learn to make good use of the object tokens, and achieves competitive results on both general video question answering and video-based referring benchmarks.
   Submitted 25 November, 2024; originally announced November 2024.
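To make the detect-segment-track to object-token step concrete, the toy sketch below aggregates per-frame, per-object features into one token per object; the mean pooling, shapes, and names are assumptions for illustration, not VideoOrion's actual aggregation.

    import numpy as np

    def object_tokens(track_features):
        """track_features: dict object_id -> (num_frames, feat_dim) array.
        Returns (num_objects, feat_dim): one token per tracked object."""
        return np.stack([feats.mean(axis=0) for feats in track_features.values()])

    # Hypothetical tracks: two objects, 8 frames, 16-dim features.
    tracks = {0: np.random.randn(8, 16), 1: np.random.randn(8, 16)}
    print(object_tokens(tracks).shape)   # (2, 16); fed to the LLM alongside text tokens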
4. arXiv:2411.11363 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: GPS-Gaussian+: Generalizable Pixel-wise 3D Gaussian Splatting for Real-Time Human-Scene Rendering from Sparse Views
   Authors: Boyao Zhou, Shunyuan Zheng, Hanzhang Tu, Ruizhi Shao, Boning Liu, Shengping Zhang, Liqiang Nie, Yebin Liu
   Abstract: Differentiable rendering techniques have recently shown promising results for free-viewpoint video synthesis of characters. However, such methods, whether based on Gaussian Splatting or neural implicit rendering, typically necessitate per-subject optimization, which does not meet the requirement of real-time rendering in interactive applications. We propose a generalizable Gaussian Splatting approach for high-resolution image rendering under a sparse-view camera setting. To this end, we introduce Gaussian parameter maps defined on the source views and directly regress Gaussian properties for instant novel view synthesis without any fine-tuning or optimization. We train our Gaussian parameter regression module on human-only data or human-scene data, jointly with a depth estimation module to lift 2D parameter maps to 3D space. The proposed framework is fully differentiable with both depth and rendering supervision or with only rendering supervision. We further introduce a regularization term and an epipolar attention mechanism to preserve geometry consistency between two source views, especially when depth supervision is omitted. Experiments on several datasets demonstrate that our method outperforms state-of-the-art methods while achieving superior rendering speed.
   Submitted 18 November, 2024; originally announced November 2024.
   Comments: Journal extension of CVPR 2024. Project page: https://yaourtb.github.io/GPS-Gaussian+
5. arXiv:2411.05005 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.RO (Robotics)
   Title: Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion Models
   Authors: Shuhong Zheng, Zhipeng Bao, Ruoyu Zhao, Martial Hebert, Yu-Xiong Wang
   Abstract: Beyond high-fidelity image synthesis, diffusion models have recently exhibited promising results in dense visual perception tasks. However, most existing work treats diffusion models as a standalone component for perception tasks, employing them either solely for off-the-shelf data augmentation or as mere feature extractors. In contrast to these isolated and thus sub-optimal efforts, we introduce a unified, versatile, diffusion-based framework, Diff-2-in-1, that can simultaneously handle both multi-modal data generation and dense visual perception through a unique exploitation of the diffusion-denoising process. Within this framework, we further enhance discriminative visual perception via multi-modal generation, by utilizing the denoising network to create multi-modal data that mirror the distribution of the original training set. Importantly, Diff-2-in-1 optimizes the utilization of the created diverse and faithful data by leveraging a novel self-improving learning mechanism. Comprehensive experimental evaluations validate the effectiveness of our framework, showcasing consistent performance improvements across various discriminative backbones and high-quality multi-modal data generation characterized by both realism and usefulness.
   Submitted 7 November, 2024; originally announced November 2024.
   Comments: 26 pages, 14 figures
6. arXiv:2411.02265 [pdf, other]
   Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent
   Authors: Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobing Xie, Jiaqi Zhu, Kai Zhang, Shuaipeng Li, Zhen Yang, Jonny Han, Xiaobo Shu, Jiahao Bu, Zhongzhi Chen, Xuemeng Huang, Fengzong Lian, Saiyong Yang, Jianfeng Yan, Yuyuan Zeng, Xiaoqin Ren, Chao Yu, Lulu Wu, Yue Mao, Jun Xia, Tao Yang, Suncong Zheng, Kan Wu, et al. (83 additional authors not shown)
   Abstract: In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture-of-experts model, with a total of 389 billion parameters and 52 billion activated parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks, including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture-of-experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Code: https://github.com/Tencent/Hunyuan-Large. Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large.
   Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
   Comments: 17 pages, 4 figures
7. arXiv:2411.01842 [pdf, other]
   Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
   Title: ElasTST: Towards Robust Varied-Horizon Forecasting with Elastic Time-Series Transformer
   Authors: Jiawen Zhang, Shun Zheng, Xumeng Wen, Xiaofang Zhou, Jiang Bian, Jia Li
   Abstract: Numerous industrial sectors necessitate models capable of providing robust forecasts across various horizons. Despite recent strides in crafting specific architectures for time-series forecasting and developing pre-trained universal models, a comprehensive examination of their capability to accommodate varied-horizon forecasting during inference is still lacking. This paper bridges this gap through the design and evaluation of the Elastic Time-Series Transformer (ElasTST). The ElasTST model incorporates a non-autoregressive design with placeholders and structured self-attention masks, warranting future outputs that are invariant to adjustments in inference horizons. A tunable version of rotary position embedding is also integrated into ElasTST to capture time-series-specific periods and enhance adaptability to different horizons. Additionally, ElasTST employs a multi-scale patch design, effectively integrating both fine-grained and coarse-grained information. During the training phase, ElasTST uses a horizon reweighting strategy that approximates the effect of random sampling across multiple horizons with a single fixed horizon setting. Through comprehensive experiments and comparisons with state-of-the-art time-series architectures and contemporary foundation models, we demonstrate the efficacy of ElasTST's unique design elements. Our findings position ElasTST as a robust solution for the practical necessity of varied-horizon forecasting.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: NeurIPS 2024
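The "tunable rotary position embedding" can be pictured as standard RoPE with an adjustable period per frequency. The sketch below shows plain RoPE with a hypothetical period_scale knob; ElasTST's actual parameterization may differ.

    import numpy as np

    def rotary_embedding(x, positions, base=10000.0, period_scale=None):
        """Apply rotary position embedding to x of shape (seq_len, dim)."""
        half = x.shape[1] // 2
        freqs = 1.0 / (base ** (np.arange(half) / half))   # one frequency per pair
        if period_scale is not None:
            freqs = freqs * period_scale                   # assumed "tunable" periods
        angles = np.outer(positions, freqs)                # (seq_len, half)
        cos, sin = np.cos(angles), np.sin(angles)
        x1, x2 = x[:, :half], x[:, half:]
        return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)

    x = np.random.randn(32, 16)
    out = rotary_embedding(x, np.arange(32), period_scale=np.ones(8))
    print(out.shape)   # (32, 16)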
8. arXiv:2410.24220 [pdf, ps, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); q-bio.QM (Quantitative Methods); stat.ML (Machine Learning)
   Title: Bridging Geometric States via Geometric Diffusion Bridge
   Authors: Shengjie Luo, Yixian Xu, Di He, Shuxin Zheng, Tie-Yan Liu, Liwei Wang
   Abstract: The accurate prediction of geometric state evolution in complex systems is critical for advancing scientific domains such as quantum chemistry and material modeling. Traditional experimental and computational methods face challenges in terms of environmental constraints and computational demands, while current deep learning approaches still fall short in terms of precision and generality. In this work, we introduce the Geometric Diffusion Bridge (GDB), a novel generative modeling framework that accurately bridges initial and target geometric states. GDB leverages a probabilistic approach to evolve geometric state distributions, employing an equivariant diffusion bridge derived from a modified version of Doob's $h$-transform for connecting geometric states. This tailored diffusion process is anchored by initial and target geometric states as fixed endpoints and governed by equivariant transition kernels. Moreover, trajectory data can be seamlessly leveraged in our GDB framework by using a chain of equivariant diffusion bridges, providing a more detailed and accurate characterization of evolution dynamics. Theoretically, we conduct a thorough examination to confirm our framework's ability to preserve joint distributions of geometric states and its capability to completely model the underlying dynamics inducing trajectory distributions with negligible error. Experimental evaluations across various real-world scenarios show that GDB surpasses existing state-of-the-art approaches, opening up a new pathway for accurately bridging geometric states and tackling crucial scientific challenges with improved accuracy and applicability.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: 33 pages, 5 tables; NeurIPS 2024 Camera Ready version
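For reference, the classical (non-equivariant) Doob $h$-transform underlying diffusion bridges conditions a base diffusion to reach a fixed endpoint $x_T$ at time $T$; GDB's equivariant modification is not reproduced here:

$$
\mathrm{d}X_t = \Big[\, b(X_t,t) + \sigma(t)\sigma(t)^{\top}\,\nabla_x \log h(X_t,t) \Big]\,\mathrm{d}t + \sigma(t)\,\mathrm{d}W_t,
\qquad h(x,t) = p\big(x_T, T \mid x, t\big),
$$

where $p$ is the transition density of the base process $\mathrm{d}X_t = b(X_t,t)\,\mathrm{d}t + \sigma(t)\,\mathrm{d}W_t$, so the additional drift term steers the process toward the target state $x_T$.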
9. arXiv:2410.21667 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Revisiting Multi-Granularity Representation via Group Contrastive Learning for Unsupervised Vehicle Re-identification
   Authors: Zhigang Chang, Shibao Zheng
   Abstract: Vehicle re-identification (Vehicle ReID) aims at retrieving vehicle images across disjoint surveillance camera views. The majority of vehicle ReID research relies heavily on supervisory labels from specific human-collected datasets for training. When applied to large-scale real-world scenarios, these models experience severe performance declines due to the notable domain discrepancy between the source dataset and the target. To address this challenge, in this paper we propose an unsupervised vehicle ReID framework (MGR-GCL). It integrates a multi-granularity CNN representation for learning discriminative transferable features and a contrastive learning module responsible for efficient domain adaptation in the unlabeled target domain. Specifically, after training the proposed Multi-Granularity Representation (MGR) on the labeled source dataset, we propose a group contrastive learning module (GCL) to generate pseudo labels for the target dataset, facilitating the domain adaptation process. We conducted extensive experiments, and the results demonstrate our superiority over existing state-of-the-art methods.
   Submitted 28 October, 2024; originally announced October 2024.
   ACM Class: I.4.7
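A minimal picture of generating pseudo labels by grouping target-domain features and contrasting against group centroids follows; the clustering choice, temperature, and loss form are assumptions for illustration, not the GCL module itself.

    import numpy as np
    from sklearn.cluster import KMeans

    def group_contrastive_loss(features, n_groups=10, temperature=0.05):
        """Cluster unlabeled features into groups (pseudo labels) and compute
        an InfoNCE-style loss against the normalized group centroids."""
        features = features / np.linalg.norm(features, axis=1, keepdims=True)
        pseudo_labels = KMeans(n_clusters=n_groups, n_init=10).fit_predict(features)
        centroids = np.stack([features[pseudo_labels == g].mean(axis=0)
                              for g in range(n_groups)])
        centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)
        logits = features @ centroids.T / temperature      # (N, n_groups)
        log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return -log_prob[np.arange(len(features)), pseudo_labels].mean()

    feats = np.random.randn(200, 128)
    print(group_contrastive_loss(feats))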
10. arXiv:2410.21465 [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language)
   Title: ShadowKV: KV Cache in Shadows for High-Throughput Long-Context LLM Inference
   Authors: Hanshi Sun, Li-Wen Chang, Wenlei Bao, Size Zheng, Ningxin Zheng, Xin Liu, Harry Dong, Yuejie Chi, Beidi Chen
   Abstract: With the widespread deployment of long-context large language models (LLMs), there has been a growing demand for efficient support of high-throughput inference. However, as the key-value (KV) cache expands with the sequence length, the increasing memory footprint and the need to access it for each token generation both result in low throughput when serving long-context LLMs. While various dynamic sparse attention methods have been proposed to speed up inference while maintaining generation quality, they either fail to sufficiently reduce GPU memory consumption or introduce significant decoding latency by offloading the KV cache to the CPU. We present ShadowKV, a high-throughput long-context LLM inference system that stores the low-rank key cache and offloads the value cache to reduce the memory footprint for larger batch sizes and longer sequences. To minimize decoding latency, ShadowKV employs an accurate KV selection strategy that reconstructs minimal sparse KV pairs on-the-fly. By evaluating ShadowKV on a broad range of benchmarks, including RULER, LongBench, and Needle In A Haystack, and models like Llama-3.1-8B, Llama-3-8B-1M, GLM-4-9B-1M, Yi-9B-200K, Phi-3-Mini-128K, and Qwen2-7B-128K, we demonstrate that it can support up to 6$\times$ larger batch sizes and boost throughput by up to 3.04$\times$ on an A100 GPU without sacrificing accuracy, even surpassing the performance achievable with infinite batch size under the assumption of infinite GPU memory. The code is available at https://github.com/bytedance/ShadowKV.
   Submitted 28 October, 2024; originally announced October 2024.
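As a rough illustration of storing a "low-rank key cache" and rebuilding only the entries needed for sparse attention (the rank, factorization, and selection policy below are assumptions, not ShadowKV's actual design):

    import numpy as np

    def compress_keys(keys, rank):
        """Store a (seq_len, head_dim) key matrix as two low-rank factors."""
        u, s, vt = np.linalg.svd(keys, full_matrices=False)
        return u[:, :rank] * s[:rank], vt[:rank]     # A: (seq_len, r), B: (r, head_dim)

    def reconstruct_selected(a, b, positions):
        """Rebuild only the key rows for the selected token positions."""
        return a[positions] @ b

    keys = np.random.randn(4096, 128)                # hypothetical long-context key cache
    a, b = compress_keys(keys, rank=32)              # ~(4096*32 + 32*128) floats vs 4096*128
    print(reconstruct_selected(a, b, np.array([0, 17, 4095])).shape)   # (3, 128)

The value cache, which the abstract says is offloaded rather than compressed, is not sketched here.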
To address this limitation, we introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of isolating clean soundtrac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21269v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21269v1-abstract-full" style="display: none;"> The scaling up has brought tremendous success in the fields of vision and language in recent years. When it comes to audio, however, researchers encounter a major challenge in scaling up the training data, as most natural audio contains diverse interfering signals. To address this limitation, we introduce Omni-modal Sound Separation (OmniSep), a novel framework capable of isolating clean soundtracks based on omni-modal queries, encompassing both single-modal and multi-modal composed queries. Specifically, we introduce the Query-Mixup strategy, which blends query features from different modalities during training. This enables OmniSep to optimize multiple modalities concurrently, effectively bringing all modalities under a unified framework for sound separation. We further enhance this flexibility by allowing queries to influence sound separation positively or negatively, facilitating the retention or removal of specific sounds as desired. Finally, OmniSep employs a retrieval-augmented approach known as Query-Aug, which enables open-vocabulary sound separation. Experimental evaluations on MUSIC, VGGSOUND-CLEAN+, and MUSIC-CLEAN+ datasets demonstrate effectiveness of OmniSep, achieving state-of-the-art performance in text-, image-, and audio-queried sound separation tasks. For samples and further information, please visit the demo page at \url{https://omnisep.github.io/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21269v1-abstract-full').style.display = 'none'; document.getElementById('2410.21269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18995">arXiv:2410.18995</a> <span> [<a href="https://arxiv.org/pdf/2410.18995">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Management of Large-Scale Optical Networks through RFID Technology Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaoying Zheng</a>, <a href="/search/cs?searchtype=author&query=Xuan%2C+X">Xingqi Xuan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shilie Zheng</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+X">Xiaonan Hui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xianmin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18995v1-abstract-short" style="display: inline;"> Managing large-scale optical distribution networks is a daunting task. This paper introduces a novel solution using radio frequency identification (RFID) technology to transform the procedure we monitor and manage the complex optical network dumb resources (ONDR). By implementing and deploying removable RFID tag pairing based on the serial peripheral interface (SPI) communication protocol, the sys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18995v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18995v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18995v1-abstract-full" style="display: none;"> Managing large-scale optical distribution networks is a daunting task. This paper introduces a novel solution using radio frequency identification (RFID) technology to transform the procedure we monitor and manage the complex optical network dumb resources (ONDR). By implementing and deploying removable RFID tag pairing based on the serial peripheral interface (SPI) communication protocol, the system identifies 30 pairs of tags within one second, even at densities of up to 5.1 tagged components per square inch of patch panel surface area. The integration of light-emitting diode (LED) navigation aids in indicating correctly matched interfaces, effectively addressing the complexities associated with large-scale fiber matching. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18995v1-abstract-full').style.display = 'none'; document.getElementById('2410.18995v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2 pages, 4 figures, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17799">arXiv:2410.17799</a> <span> [<a href="https://arxiv.org/pdf/2410.17799">pdf</a>, <a href="https://arxiv.org/format/2410.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaohong Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17799v1-abstract-short" style="display: inline;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17799v1-abstract-full" style="display: none;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel End-to-End GPT-based model OmniFlatten for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. 
arXiv:2410.17095 [pdf, other]  Subjects: cs.CR
Inferentially-Private Private Information
Authors: Shuaiqi Wang, Shuran Zheng, Zinan Lin, Giulia Fanti, Zhiwei Steven Wu
Abstract: Information disclosure can compromise privacy when revealed information is correlated with private information. We consider the notion of inferential privacy, which measures privacy leakage by bounding the inferential power a Bayesian adversary can gain by observing a released signal. Our goal is to devise an inferentially-private private information structure that maximizes the informativeness of the released signal, following the Blackwell ordering principle, while adhering to inferential privacy constraints. To achieve this, we devise an efficient release mechanism that achieves the inferentially-private Blackwell-optimal private information structure for the setting where the private information is binary. Additionally, we propose a programming approach to compute the optimal structure for general cases given the utility function. The design of our mechanisms builds on our geometric characterization of the Blackwell-optimal disclosure mechanisms under privacy constraints, which may be of independent interest.
Submitted 22 October, 2024; originally announced October 2024.

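To make "the inferential power a Bayesian adversary can gain" concrete, here is a small numerical check in our own toy formulation (not the paper's mechanism): for a binary secret and a release channel given by conditional signal probabilities, it computes the adversary's posterior and the worst-case multiplicative gain over the prior.

```python
import numpy as np

def inferential_leakage(prior, channel):
    """Worst-case posterior/prior ratio for a binary secret.

    prior:   [P(secret=0), P(secret=1)]                 -- shape (2,)
    channel: channel[s, y] = P(signal=y | secret=s)     -- shape (2, n_signals)
    Returns max over signals y and secrets s of P(s | y) / P(s).
    """
    prior = np.asarray(prior, dtype=float)
    channel = np.asarray(channel, dtype=float)
    joint = prior[:, None] * channel          # P(s, y)
    p_y = joint.sum(axis=0)                   # P(y)
    posterior = joint / p_y                   # P(s | y); columns sum to 1
    return float((posterior / prior[:, None]).max())

# A signal that leaks a lot: y almost reveals the secret
leaky = inferential_leakage([0.5, 0.5], [[0.9, 0.1], [0.1, 0.9]])
# A signal that leaks nothing: y is independent of the secret
flat  = inferential_leakage([0.5, 0.5], [[0.6, 0.4], [0.6, 0.4]])
print(round(leaky, 2), round(flat, 2))  # 1.8 1.0
```

An inferential-privacy style constraint would then cap this ratio across all signals, and the design problem is to pick the channel that is as Blackwell-informative as possible subject to that cap.
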
arXiv:2410.15283 [pdf]  Subjects: cs.LG; eess.SY
TRIZ Method for Urban Building Energy Optimization: GWO-SARIMA-LSTM Forecasting Model
Authors: Shirong Zheng, Shaobo Liu, Zhenhong Zhang, Dian Gu, Chunqiu Xia, Huadong Pang, Enock Mintah Ampaw
Abstract: With the advancement of global climate change and sustainable development goals, urban building energy consumption optimization and carbon emission reduction have become the focus of research. Traditional energy consumption prediction methods often lack accuracy and adaptability due to their inability to fully consider complex energy consumption patterns, especially in dealing with seasonal fluctuations and dynamic changes. This study proposes a hybrid deep learning model that combines TRIZ innovation theory with GWO, SARIMA, and LSTM to improve the accuracy of building energy consumption prediction. TRIZ plays a key role in model design, providing innovative solutions to achieve an effective balance between energy efficiency, cost, and comfort by systematically analyzing the contradictions in energy consumption optimization. GWO is used to optimize the parameters of the model to ensure that the model maintains high accuracy under different conditions. The SARIMA model focuses on capturing seasonal trends in the data, while the LSTM model handles short-term and long-term dependencies in the data, further improving the accuracy of the prediction. The main contribution of this research is the development of a robust model that leverages the strengths of TRIZ and advanced deep learning techniques, improving the accuracy of energy consumption predictions. Our experiments demonstrate a significant 15% reduction in prediction error compared to existing models. This innovative approach not only enhances urban energy management but also provides a new framework for optimizing energy use and reducing carbon emissions, contributing to sustainable development.
Submitted 20 October, 2024; originally announced October 2024.
Comments: 29 pages

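Of the components named in this abstract, GWO (grey wolf optimizer) is the most self-contained. A minimal textbook-style implementation for tuning a small parameter vector might look like the sketch below; this is generic GWO, not the paper's configuration, and the objective is a stand-in for a model's validation error.

```python
import numpy as np

def gwo_minimize(objective, dim, bounds, n_wolves=20, n_iters=100, seed=0):
    """Minimal grey wolf optimizer: wolves move toward the three best solutions."""
    rng = np.random.default_rng(seed)
    lo, hi = bounds
    wolves = rng.uniform(lo, hi, size=(n_wolves, dim))
    for t in range(n_iters):
        fitness = np.array([objective(w) for w in wolves])
        alpha, beta, delta = wolves[np.argsort(fitness)[:3]]
        a = 2 - 2 * t / n_iters                       # exploration factor decays 2 -> 0
        new_positions = []
        for w in wolves:
            steps = []
            for leader in (alpha, beta, delta):
                r1, r2 = rng.random(dim), rng.random(dim)
                A, C = 2 * a * r1 - a, 2 * r2
                D = np.abs(C * leader - w)
                steps.append(leader - A * D)
            new_positions.append(np.clip(np.mean(steps, axis=0), lo, hi))
        wolves = np.array(new_positions)
    fitness = np.array([objective(w) for w in wolves])
    return wolves[fitness.argmin()], fitness.min()

# Stand-in objective: sphere function (in practice, e.g. forecast error of SARIMA/LSTM hyperparameters)
best_x, best_f = gwo_minimize(lambda x: float(np.sum(x ** 2)), dim=3, bounds=(-5, 5))
print(best_x.round(3), round(best_f, 6))
```
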
arXiv:2410.15040 [pdf, other]  Subjects: cs.AI
Retrieval Augmented Diffusion Model for Structure-informed Antibody Design and Optimization
Authors: Zichen Wang, Yaokun Ji, Jianing Tian, Shuangjia Zheng
Abstract: Antibodies are essential proteins responsible for immune responses in organisms, capable of specifically recognizing antigen molecules of pathogens. Recent advances in generative models have significantly enhanced rational antibody design. However, existing methods mainly create antibodies from scratch without template constraints, leading to model optimization challenges and unnatural sequences. To address these issues, we propose a retrieval-augmented diffusion framework, termed RADAb, for efficient antibody design. Our method leverages a set of structural homologous motifs that align with query structural constraints to guide the generative model in inversely optimizing antibodies according to desired design criteria. Specifically, we introduce a structure-informed retrieval mechanism that integrates these exemplar motifs with the input backbone through a novel dual-branch denoising module, utilizing both structural and evolutionary information. Additionally, we develop a conditional diffusion model that iteratively refines the optimization process by incorporating both global context and local evolutionary conditions. Our approach is agnostic to the choice of generative models. Empirical experiments demonstrate that our method achieves state-of-the-art performance in multiple antibody inverse folding and optimization tasks, offering a new perspective on biomolecular generative models.
Submitted 19 October, 2024; originally announced October 2024.

arXiv:2410.12957 [pdf, other]  Subjects: cs.SD; cs.CV; cs.MM; eess.AS
MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic Synchronization
Authors: Ruiqi Li, Siqi Zheng, Xize Cheng, Ziang Zhang, Shengpeng Ji, Zhou Zhao
Abstract: Generating music that aligns with the visual content of a video has been a challenging task, as it requires a deep understanding of visual semantics and involves generating music whose melody, rhythm, and dynamics harmonize with the visual narratives. This paper presents MuVi, a novel framework that effectively addresses these challenges to enhance the cohesion and immersive experience of audio-visual content. MuVi analyzes video content through a specially designed visual adaptor to extract contextually and temporally relevant features. These features are used to generate music that not only matches the video's mood and theme but also its rhythm and pacing. We also introduce a contrastive music-visual pre-training scheme to ensure synchronization, based on the periodic nature of music phrases. In addition, we demonstrate that our flow-matching-based music generator has in-context learning ability, allowing us to control the style and genre of the generated music. Experimental results show that MuVi demonstrates superior performance in both audio quality and temporal synchronization. The generated music video samples are available at https://muvi-v2m.github.io.
Submitted 16 October, 2024; originally announced October 2024.
Comments: Work in progress

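The contrastive music-visual pre-training is described only in outline. A generic symmetric InfoNCE loss over paired video-clip and music-clip embeddings is a common recipe for this kind of alignment and is sketched below as an assumption about the general shape; the paper's periodicity-based positive pairing is not modeled here.

```python
import torch
import torch.nn.functional as F

def contrastive_av_loss(video_emb, music_emb, temperature=0.07):
    """Symmetric InfoNCE: the i-th video clip should match the i-th music clip.

    video_emb, music_emb: (batch, dim) embeddings from the two encoders.
    """
    v = F.normalize(video_emb, dim=-1)
    m = F.normalize(music_emb, dim=-1)
    logits = v @ m.t() / temperature                 # (batch, batch) similarity matrix
    targets = torch.arange(v.size(0), device=v.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# Toy usage with random "encoder outputs"
loss = contrastive_av_loss(torch.randn(8, 128), torch.randn(8, 128))
print(float(loss))
```
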
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06044">arXiv:2410.06044</a> <span> [<a href="https://arxiv.org/pdf/2410.06044">pdf</a>, <a href="https://arxiv.org/format/2410.06044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HyperDet: Generalizable Detection of Synthesized Images by Generating and Merging A Mixture of Hyper LoRAs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+H">Huangsen Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yinfeng Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sixian Zheng</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+K">Kangtao Lv</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhimeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xin Ding</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06044v1-abstract-short" style="display: inline;"> The emergence of diverse generative vision models has recently enabled the synthesis of visually realistic images, underscoring the critical need for effectively detecting these generated images from real photos. Despite advances in this field, existing detection approaches often struggle to accurately identify synthesized images generated by different generative models. In this work, we introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06044v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06044v1-abstract-full" style="display: none;"> The emergence of diverse generative vision models has recently enabled the synthesis of visually realistic images, underscoring the critical need for effectively detecting these generated images from real photos. Despite advances in this field, existing detection approaches often struggle to accurately identify synthesized images generated by different generative models. In this work, we introduce a novel and generalizable detection framework termed HyperDet, which innovatively captures and integrates shared knowledge from a collection of functionally distinct and lightweight expert detectors. HyperDet leverages a large pretrained vision model to extract general detection features while simultaneously capturing and enhancing task-specific features. To achieve this, HyperDet first groups SRM filters into five distinct groups to efficiently capture varying levels of pixel artifacts based on their different functionality and complexity. Then, HyperDet utilizes a hypernetwork to generate LoRA model weights with distinct embedding parameters. 
arXiv:2410.03311 [pdf, other]  Subjects: cs.CV; cs.LG
Quo Vadis, Motion Generation? From Large Language Models to Large Motion Models
Authors: Ye Wang, Sipeng Zheng, Bin Cao, Qianshan Wei, Qin Jin, Zongqing Lu
Abstract: Inspired by the recent success of LLMs, the field of human motion understanding has increasingly shifted towards the development of large motion models. Despite some progress, current state-of-the-art works remain far from achieving truly generalist models, largely due to the lack of large-scale, high-quality motion data. To address this, we present MotionBase, the first million-level motion generation benchmark, offering 15 times the data volume of the previous largest dataset, and featuring multimodal data with hierarchically detailed text descriptions. By leveraging this vast dataset, our large motion model demonstrates strong performance across a broad range of motions, including unseen ones. Through systematic investigation, we underscore the importance of scaling both data and model size, with synthetic data and pseudo labels playing a crucial role in mitigating data acquisition costs. Moreover, our research reveals the limitations of existing evaluation metrics, particularly in handling out-of-domain text instructions -- an issue that has long been overlooked. In addition, we introduce a novel 2D lookup-free approach for motion tokenization, which preserves motion information and expands codebook capacity, further enhancing the representative ability of large motion models. The release of MotionBase and the insights gained from this study are expected to pave the way for the development of more powerful and versatile motion generation models.
Submitted 4 October, 2024; originally announced October 2024.

arXiv:2410.02155 [pdf, other]  Subjects: cs.AI; cs.CL; cs.CV
From Pixels to Tokens: Byte-Pair Encoding on Quantized Visual Modalities
Authors: Wanpeng Zhang, Zilong Xie, Yicheng Feng, Yijiang Li, Xingrun Xing, Sipeng Zheng, Zongqing Lu
Abstract: Multimodal Large Language Models have made significant strides in integrating visual and textual information, yet they often struggle with effectively aligning these modalities. We introduce a novel image tokenizer that bridges this gap by applying the principle of Byte-Pair Encoding (BPE) to visual data. Unlike conventional approaches that rely on separate visual encoders, our method directly incorporates structural prior information into image tokens, mirroring the successful tokenization strategies used in text-only Large Language Models. This innovative approach enables Transformer models to more effectively learn and reason across modalities. Through theoretical analysis and extensive experiments, we demonstrate that our BPE Image Tokenizer significantly enhances MLLMs' multimodal understanding capabilities, even with limited training data. Our method not only improves performance across various benchmarks but also shows promising scalability, potentially paving the way for more efficient and capable multimodal foundation models.
Submitted 4 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.

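Applying BPE to visual data presupposes that images are first quantized into discrete token IDs; after that, the merge loop is ordinary BPE. The sketch below runs a few merges over toy "visual token" sequences; the quantizer itself is out of scope, and nothing here is specific to the paper's tokenizer.

```python
from collections import Counter

def bpe_train(sequences, num_merges):
    """Learn BPE merges over sequences of discrete (e.g. quantized visual) token IDs."""
    seqs = [list(s) for s in sequences]
    merges = []
    next_id = max(t for s in seqs for t in s) + 1
    for _ in range(num_merges):
        pairs = Counter((s[i], s[i + 1]) for s in seqs for i in range(len(s) - 1))
        if not pairs:
            break
        best = pairs.most_common(1)[0][0]
        merges.append((best, next_id))
        # Replace every occurrence of the most frequent pair with the new token ID.
        for s in seqs:
            i = 0
            while i < len(s) - 1:
                if (s[i], s[i + 1]) == best:
                    s[i:i + 2] = [next_id]
                else:
                    i += 1
        next_id += 1
    return merges, seqs

# Toy "images": rows of quantized codebook indices with a recurring pattern (7, 3)
corpus = [[7, 3, 7, 3, 1], [2, 7, 3, 7, 3], [7, 3, 5]]
merges, encoded = bpe_train(corpus, num_merges=2)
print(merges)   # [((7, 3), 8), ((8, 8), 9)]
print(encoded)  # [[9, 1], [2, 9], [8, 5]]
```

The learned merges compress frequently co-occurring codebook indices into single tokens, which is the sense in which structural priors end up inside the token vocabulary rather than inside a separate visual encoder.
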
arXiv:2410.00773 [pdf, other]  Subjects: cs.AI; cs.CL
BabelBench: An Omni Benchmark for Code-Driven Analysis of Multimodal and Multistructured Data
Authors: Xuwu Wang, Qiwen Cui, Yunzhe Tao, Yiran Wang, Ziwei Chai, Xiaotian Han, Boyi Liu, Jianbo Yuan, Jing Su, Guoyin Wang, Tingkai Liu, Liyu Chen, Tianyi Liu, Tao Sun, Yufeng Zhang, Sirui Zheng, Quanzeng You, Yang Yang, Hongxia Yang
Abstract: Large language models (LLMs) have become increasingly pivotal across various domains, especially in handling complex data types. This includes structured data processing, as exemplified by ChartQA and ChatGPT-Ada, and multimodal unstructured data processing as seen in Visual Question Answering (VQA). These areas have attracted significant attention from both industry and academia. Despite this, there remains a lack of unified evaluation methodologies for these diverse data handling scenarios. In response, we introduce BabelBench, an innovative benchmark framework that evaluates the proficiency of LLMs in managing multimodal multistructured data with code execution. BabelBench incorporates a dataset comprising 247 meticulously curated problems that challenge the models with tasks in perception, commonsense reasoning, logical reasoning, and so on. Beyond the basic capabilities of multimodal understanding, structured data processing, and code generation, these tasks demand advanced capabilities in exploration, planning, reasoning, and debugging. Our experimental findings on BabelBench indicate that even cutting-edge models like ChatGPT 4 exhibit substantial room for improvement. The insights derived from our comprehensive analysis offer valuable guidance for future research within the community. The benchmark data can be found at https://github.com/FFD8FFE/babelbench.
Submitted 1 October, 2024; originally announced October 2024.

arXiv:2410.00327 [pdf, other]  Subjects: cs.LG; cs.AI; cs.CE; q-bio.QM
EnzymeFlow: Generating Reaction-specific Enzyme Catalytic Pockets through Flow Matching and Co-Evolutionary Dynamics
Authors: Chenqing Hua, Yong Liu, Dinghuai Zhang, Odin Zhang, Sitao Luan, Kevin K. Yang, Guy Wolf, Doina Precup, Shuangjia Zheng
Abstract: Enzyme design is a critical area in biotechnology, with applications ranging from drug development to synthetic biology. Traditional methods for enzyme function prediction or protein binding pocket design often fall short in capturing the dynamic and complex nature of enzyme-substrate interactions, particularly in catalytic processes. To address these challenges, we introduce EnzymeFlow, a generative model that employs flow matching with hierarchical pre-training and enzyme-reaction co-evolution to generate catalytic pockets for specific substrates and catalytic reactions. Additionally, we introduce a large-scale, curated, and validated dataset of enzyme-reaction pairs, specifically designed for the catalytic pocket generation task, comprising a total of 328,192 pairs. By incorporating evolutionary dynamics and reaction-specific adaptations, EnzymeFlow becomes a powerful model for designing enzyme pockets that is capable of catalyzing a wide range of biochemical reactions. Experiments on the new dataset demonstrate the model's effectiveness in designing high-quality, functional enzyme catalytic pockets, paving the way for advancements in enzyme engineering and synthetic biology. We provide EnzymeFlow code at https://github.com/WillHua127/EnzymeFlow with a notebook demonstration at https://github.com/WillHua127/EnzymeFlow/blob/main/enzymeflow_demo.ipynb.
Submitted 30 September, 2024; originally announced October 2024.

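Flow matching itself has a compact training objective; a generic, unconditional Euclidean version is sketched below only to make the term concrete. EnzymeFlow's actual model operates on pocket structures with co-evolutionary conditioning, which is far beyond this toy, and all names here are ours.

```python
import torch
import torch.nn as nn

# Tiny velocity-field network v_theta(x_t, t); the real model would be structure-aware.
class VelocityField(nn.Module):
    def __init__(self, dim=2, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, hidden), nn.SiLU(),
                                 nn.Linear(hidden, dim))

    def forward(self, x, t):
        return self.net(torch.cat([x, t], dim=-1))

def flow_matching_step(model, optimizer, x1):
    """One training step of (rectified) flow matching on a batch of data x1."""
    x0 = torch.randn_like(x1)                       # noise sample
    t = torch.rand(x1.size(0), 1)                   # random time in [0, 1]
    xt = (1 - t) * x0 + t * x1                      # point on the straight path
    target_v = x1 - x0                              # constant velocity of that path
    loss = ((model(xt, t) - target_v) ** 2).mean()  # regress predicted velocity
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return float(loss)

model = VelocityField()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
data = torch.randn(256, 2) * 0.5 + torch.tensor([2.0, -1.0])  # toy 2D target distribution
for step in range(200):
    loss = flow_matching_step(model, opt, data)
print("final loss:", round(loss, 4))
```

Sampling would then integrate the learned velocity field from noise at t = 0 to data at t = 1; the conditioning on substrate and reaction in EnzymeFlow would enter as extra inputs to the velocity network.
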
arXiv:2409.13292 [pdf, other]  Subjects: eess.AS; cs.SD
Exploring Text-Queried Sound Event Detection with Audio Source Separation
Authors: Han Yin, Jisheng Bai, Yang Xiao, Hui Wang, Siqi Zheng, Yafeng Chen, Rohan Kumar Das, Chong Deng, Jianfeng Chen
Abstract: In sound event detection (SED), overlapping sound events pose a significant challenge, as certain events can be easily masked by background noise or other events, resulting in poor detection performance. To address this issue, we propose the text-queried SED (TQ-SED) framework. Specifically, we first pre-train a language-queried audio source separation (LASS) model to separate the audio tracks corresponding to different events from the input audio. Then, multiple target SED branches are employed to detect individual events. AudioSep is a state-of-the-art LASS model but has limitations in extracting dynamic audio information because of its purely convolutional separation structure. To address this, we integrate a dual-path recurrent neural network block into the model. We refer to this structure as AudioSep-DP, which achieved first place in DCASE 2024 Task 9 on language-queried audio source separation (objective single-model track). Experimental results show that TQ-SED can significantly improve SED performance, with an improvement of 7.22% in F1 score over the conventional framework. Additionally, we set up comprehensive experiments to explore the impact of model complexity. The source code and pre-trained model are released at https://github.com/apple-yinhan/TQ-SED.
Submitted 20 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP2025

arXiv:2409.09763 [pdf, other]  Subjects: cs.RO
Range-SLAM: Ultra-Wideband-Based Smoke-Resistant Real-Time Localization and Mapping
Authors: Yi Liu, Zhuozhu Jian, Shengtao Zheng, Houde Liu, Xueqian Wang, Xinlei Chen, Bin Liang
Abstract: This paper presents Range-SLAM, a real-time, lightweight SLAM system designed to address the challenges of localization and mapping in environments with smoke and other harsh conditions using Ultra-Wideband (UWB) signals. While optical sensors like LiDAR and cameras struggle in low-visibility environments, UWB signals provide a robust alternative for real-time positioning. The proposed system uses general UWB devices to achieve accurate mapping and localization without relying on expensive LiDAR or other dedicated hardware. By utilizing only the distance and Received Signal Strength Indicator (RSSI) provided by UWB sensors in relation to anchors, we combine the motion of the tag-carrying agent with a raycasting algorithm to construct a 2D occupancy grid map in real time. To enhance localization in challenging conditions, a Weighted Least Squares (WLS) method is employed. Extensive real-world experiments, including smoke-filled environments and simulated …
Submitted 15 September, 2024; originally announced September 2024.

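The WLS localization step can be made concrete with standard range-based multilateration: linearize the range equations against one reference anchor and solve a weighted least-squares system. This is a textbook formulation, not Range-SLAM's implementation; in particular, how the system derives its weights from RSSI is not shown in the abstract, so the weights below are placeholders.

```python
import numpy as np

def wls_position(anchors, ranges, weights=None):
    """2D position from UWB ranges via linearized weighted least squares.

    anchors: (n, 2) anchor coordinates; ranges: (n,) measured distances;
    weights: (n-1,) confidence per linearized equation (e.g. derived from RSSI).
    """
    anchors = np.asarray(anchors, float)
    ranges = np.asarray(ranges, float)
    a0, r0 = anchors[0], ranges[0]
    A = 2.0 * (anchors[1:] - a0)
    b = (np.sum(anchors[1:] ** 2, axis=1) - np.sum(a0 ** 2)
         + r0 ** 2 - ranges[1:] ** 2)
    W = np.diag(np.ones(len(A)) if weights is None else np.asarray(weights, float))
    return np.linalg.solve(A.T @ W @ A, A.T @ W @ b)

# Four anchors at the corners of a 10 m x 10 m room, tag at (3, 4)
anchors = [(0, 0), (10, 0), (0, 10), (10, 10)]
tag = np.array([3.0, 4.0])
ranges = [np.linalg.norm(tag - np.array(a)) for a in anchors]
print(wls_position(anchors, ranges).round(3))  # [3. 4.]
```
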
arXiv:2409.08782 [pdf, other]  Subjects: cs.CV
Contactless Fingerprint Recognition Using 3D Graph Matching
Authors: Zhe Cui, Yuwei Jia, Siyang Zheng, Fei Su
Abstract: Contactless fingerprints are a newly developed type of fingerprint and have gained considerable attention in recent fingerprint studies. However, most existing contactless fingerprint algorithms treat contactless fingerprints as 2D plain fingerprints and use recognition methods similar to those for traditional contact-based 2D fingerprints. This recognition approach does not consider the modality difference between contactless and contact fingerprints, especially the intrinsic 3D characteristic of contactless fingerprints. This paper proposes a novel contactless fingerprint recognition algorithm that captures the revealed 3D feature of contactless fingerprints rather than the plain 2D feature. The proposed method first recovers 3D features from the input contactless fingerprint, including the 3D shape model and 3D fingerprint features (minutiae, orientation, etc.). Then, a novel 3D graph matching is conducted in 3D space according to the extracted 3D features. Our method captures the real 3D nature of contactless fingerprints, as the whole feature extraction and matching pipeline is completed in real 3D space. Experimental results on contactless fingerprint databases show that the proposed method successfully improves the matching accuracy of contactless fingerprints. Notably, our method performs stably across multiple poses of contactless fingerprints due to 3D graph matching, which is a great advantage over previous contactless fingerprint recognition algorithms.
Submitted 13 September, 2024; originally announced September 2024.

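Matching in 3D space presupposes some way to align two 3D minutiae sets. As one illustrative building block, generic Kabsch/Procrustes alignment over already-corresponding points (not the paper's graph-matching algorithm) computes the best rigid rotation and translation between the two clouds:

```python
import numpy as np

def kabsch_align(P, Q):
    """Best-fit rotation R and translation t mapping 3D points P onto Q (least squares)."""
    P, Q = np.asarray(P, float), np.asarray(Q, float)
    cP, cQ = P.mean(axis=0), Q.mean(axis=0)
    H = (P - cP).T @ (Q - cQ)                 # cross-covariance of centered clouds
    U, _, Vt = np.linalg.svd(H)
    d = np.sign(np.linalg.det(Vt.T @ U.T))    # guard against reflections
    R = Vt.T @ np.diag([1.0, 1.0, d]) @ U.T
    t = cQ - R @ cP
    return R, t

# Toy minutiae clouds: Q is a rotated and shifted copy of P
rng = np.random.default_rng(1)
P = rng.normal(size=(12, 3))
angle = np.deg2rad(30)
R_true = np.array([[np.cos(angle), -np.sin(angle), 0],
                   [np.sin(angle),  np.cos(angle), 0],
                   [0, 0, 1]])
Q = P @ R_true.T + np.array([0.5, -0.2, 1.0])
R, t = kabsch_align(P, Q)
print(np.allclose(P @ R.T + t, Q))  # True
```

A pose-robust matcher would combine such an alignment with correspondence search over the 3D minutiae graph, which is where the paper's contribution lies.
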
arXiv:2409.07055 [pdf, other]  cs.CL cs.AI cs.CY
Legal Fact Prediction: Task Definition and Dataset Construction
Authors: Junkai Liu, Yujie Tong, Hui Huang, Shuyuan Zheng, Muyun Yang, Peicheng Wu, Makoto Onizuka, Chuan Xiao
Abstract: Legal facts refer to the facts that can be proven by acknowledged evidence in a trial. They form the basis for the determination of court judgments. This paper introduces a novel NLP task: legal fact prediction, which aims to predict legal facts based on a list of evidence. The predicted facts can instruct the parties and their lawyers involved in a trial to strengthen their submissions and optimize their strategies during the trial. Moreover, since real legal facts are difficult to obtain before the final judgment, the predicted facts also serve as an important basis for legal judgment prediction. We construct a benchmark dataset, LFPLoan, consisting of evidence lists and ground-truth legal facts for real civil loan cases. Our experiments on this dataset show that this task is non-trivial and requires considerable further research effort.
Submitted 11 September, 2024; originally announced September 2024.

arXiv:2409.03757 [pdf, other]  cs.CV cs.AI cs.CL cs.LG cs.RO
Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene Understanding
Authors: Yunze Man, Shuhong Zheng, Zhipeng Bao, Martial Hebert, Liang-Yan Gui, Yu-Xiong Wang
Abstract: Complex 3D scene understanding has gained increasing attention, with scene encoding strategies playing a crucial role in this success. However, the optimal scene encoding strategies for various scenarios remain unclear, particularly compared to their image-based counterparts. To address this issue, we present a comprehensive study that probes various visual encoding models for 3D scene understanding, identifying the strengths and limitations of each model across different scenarios. Our evaluation spans seven vision foundation encoders, including image-based, video-based, and 3D foundation models. We evaluate these models in four tasks: Vision-Language Scene Reasoning, Visual Grounding, Segmentation, and Registration, each focusing on different aspects of scene understanding. Our evaluations yield key findings: DINOv2 demonstrates superior performance, video models excel in object-level tasks, diffusion models benefit geometric tasks, and language-pretrained models show unexpected limitations in language-related tasks. These insights challenge some conventional understandings, provide novel perspectives on leveraging visual foundation models, and highlight the need for more flexible encoder selection in future vision-language and scene-understanding tasks. Code: https://github.com/YunzeMan/Lexicon3D
Submitted 22 November, 2024; v1 submitted 5 September, 2024; originally announced September 2024.
Comments: NeurIPS 2024. Project page: https://yunzeman.github.io/lexicon3d Github: https://github.com/YunzeMan/Lexicon3D
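As a rough sketch of the probing protocol such studies typically rely on (a frozen encoder plus a lightweight task head trained on its features), consider the minimal PyTorch example below. The stand-in encoder, feature dimension, and dummy classification task are assumptions for illustration; the paper itself evaluates real image, video, and 3D foundation models on scene-level tasks.

```python
import torch
import torch.nn as nn

# Hypothetical frozen encoder stand-in; in practice this would be a pretrained
# vision foundation model producing per-scene or per-point features.
encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 256))
for p in encoder.parameters():
    p.requires_grad_(False)

probe = nn.Linear(256, 10)  # lightweight task head trained on frozen features
opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

x = torch.randn(8, 3, 32, 32)       # dummy batch of inputs
y = torch.randint(0, 10, (8,))      # dummy task labels

with torch.no_grad():
    feats = encoder(x)              # encoder stays fixed; only the probe learns
loss = loss_fn(probe(feats), y)
opt.zero_grad()
loss.backward()
opt.step()
print(float(loss))
```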
arXiv:2409.01787 [pdf, other]  cs.CL
LLM-GAN: Construct Generative Adversarial Network Through Large Language Models For Explainable Fake News Detection
Authors: Yifeng Wang, Zhouhong Gu, Siwei Zhang, Suhang Zheng, Tao Wang, Tianyu Li, Hongwei Feng, Yanghua Xiao
Abstract: Explainable fake news detection predicts the authenticity of news items with annotated explanations. Today, Large Language Models (LLMs) are known for their powerful natural language understanding and explanation generation abilities. However, applying LLMs to explainable fake news detection still faces two main challenges. Firstly, fake news appears reasonable and can easily mislead LLMs, leaving them unable to understand the complex news-faking process. Secondly, utilizing LLMs for this task would generate both correct and incorrect explanations, which necessitates abundant human labor in the loop. In this paper, we propose LLM-GAN, a novel framework that utilizes prompting mechanisms to enable an LLM to act as both Generator and Detector for realistic fake news generation and detection. Our results demonstrate LLM-GAN's effectiveness in both prediction performance and explanation quality. We further showcase the integration of LLM-GAN into a cloud-native AI platform to provide a better fake news detection service in the cloud.
Submitted 3 September, 2024; originally announced September 2024.

arXiv:2408.16532 [pdf, other]  eess.AS cs.LG cs.MM cs.SD eess.SP
WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling
Authors: Shengpeng Ji, Ziyue Jiang, Wen Wang, Yifu Chen, Minghui Fang, Jialong Zuo, Qian Yang, Xize Cheng, Zehan Wang, Ruiqi Li, Ziang Zhang, Xiaoda Yang, Rongjie Huang, Yidi Jiang, Qian Chen, Siqi Zheng, Wen Wang, Zhou Zhao
Abstract: Language models have been effectively applied to modeling natural signals, such as images, video, speech, and audio. A crucial component of these models is the codec tokenizer, which compresses high-dimensional natural signals into lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, which offers several advantages over previous SOTA acoustic codec models in the audio domain: 1) extreme compression: by compressing the layers of quantizers and the temporal dimension of the discrete codec, one second of 24 kHz audio requires only a single quantizer with 40 or 75 tokens; 2) improved subjective quality: despite the reduced number of tokens, WavTokenizer achieves state-of-the-art reconstruction quality with outstanding UTMOS scores and inherently contains richer semantic information. Specifically, we achieve these results by designing a broader VQ space, extended contextual windows, and improved attention networks, as well as introducing a powerful multi-scale discriminator and an inverse Fourier transform structure. We conducted extensive reconstruction experiments in the domains of speech, audio, and music. WavTokenizer exhibited strong performance across various objective and subjective metrics compared to state-of-the-art models. We also tested semantic information, VQ utilization, and adaptability to generative models. Comprehensive ablation studies confirm the necessity of each module in WavTokenizer. The related code, demos, and pre-trained models are available at https://github.com/jishengpeng/WavTokenizer.
Submitted 22 October, 2024; v1 submitted 29 August, 2024; originally announced August 2024.
Comments: Work in progress
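The compression claim is easy to sanity-check with a little arithmetic: at 24 kHz, 40 or 75 tokens per second correspond to one token per 600 or 320 waveform samples, respectively. The snippet below works this out; the 12-bit codebook used for the bitrate line is an assumption purely for illustration, since the abstract does not state the codebook size.

```python
# Back-of-the-envelope check of the compression claim: 1 s of 24 kHz audio
# represented by a single quantizer stream of 40 or 75 tokens per second.
SAMPLE_RATE = 24_000

for tokens_per_sec in (40, 75):
    samples_per_token = SAMPLE_RATE / tokens_per_sec
    print(f"{tokens_per_sec} tok/s -> {samples_per_token:.0f} samples per token")

# If the codebook held 4096 entries (12 bits/token -- an assumption, not stated
# in the abstract), the token stream would amount to:
for tokens_per_sec in (40, 75):
    print(f"{tokens_per_sec} tok/s @ 12 bit -> {tokens_per_sec * 12} bit/s")
```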
arXiv:2408.16315 [pdf, other]  cs.HC cs.LG eess.SP
Passenger hazard perception based on EEG signals for highly automated driving vehicles
Authors: Ashton Yu Xuan Tan, Yingkai Yang, Xiaofei Zhang, Bowen Li, Xiaorong Gao, Sifa Zheng, Jianqiang Wang, Xinyu Gu, Jun Li, Yang Zhao, Yuxin Zhang, Tania Stathaki
Abstract: Enhancing the safety of autonomous vehicles is crucial, especially given recent accidents involving automated systems. As passengers in these vehicles, humans' sensory perception and decision-making can be integrated with autonomous systems to improve safety. This study explores neural mechanisms in passenger-vehicle interactions, leading to the development of a Passenger Cognitive Model (PCM) and the Passenger EEG Decoding Strategy (PEDS). Central to PEDS is a novel Convolutional Recurrent Neural Network (CRNN) that captures spatial and temporal EEG data patterns. The CRNN, combined with stacking algorithms, achieves an accuracy of $85.0\% \pm 3.18\%$. Our findings highlight the predictive power of pre-event EEG data, enhancing the detection of hazardous scenarios and offering a network-driven framework for safer autonomous vehicles.
Submitted 29 August, 2024; originally announced August 2024.
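To make the CRNN idea concrete, here is a minimal PyTorch sketch of a convolutional-recurrent classifier over EEG windows shaped (batch, channels, time): convolutions mix electrode and local temporal structure, a GRU models the longer-range temporal context, and a linear head scores the window. The channel count, window length, layer sizes, and two-class output are assumptions for illustration, not the architecture used in the paper.

```python
import torch
import torch.nn as nn

class TinyCRNN(nn.Module):
    """Minimal CRNN for EEG windows shaped (batch, channels, time)."""
    def __init__(self, n_channels=32, hidden=64, n_classes=2):
        super().__init__()
        # 1D convolutions mix spatial (electrode) and local temporal structure.
        self.conv = nn.Sequential(
            nn.Conv1d(n_channels, 32, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.Conv1d(32, 32, kernel_size=7, padding=3, stride=2),
            nn.ReLU(),
        )
        # GRU models longer-range temporal dependencies across the window.
        self.gru = nn.GRU(32, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_classes)

    def forward(self, x):                 # x: (B, C, T)
        z = self.conv(x)                  # (B, 32, T // 2)
        z = z.transpose(1, 2)             # (B, T // 2, 32)
        _, h = self.gru(z)                # h: (1, B, hidden)
        return self.head(h[-1])           # (B, n_classes)

eeg = torch.randn(4, 32, 256)             # 4 windows, 32 electrodes, 256 samples
print(TinyCRNN()(eeg).shape)              # torch.Size([4, 2])
```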
arXiv:2408.13659 [pdf, other]  cs.LG cs.AI cs.CE q-bio.QM
ReactZyme: A Benchmark for Enzyme-Reaction Prediction
Authors: Chenqing Hua, Bozitao Zhong, Sitao Luan, Liang Hong, Guy Wolf, Doina Precup, Shuangjia Zheng
Abstract: Enzymes, with their specific catalyzed reactions, are necessary for all aspects of life, enabling diverse biological processes and adaptations. Predicting enzyme functions is essential for understanding biological pathways, guiding drug development, enhancing bioproduct yields, and facilitating evolutionary studies. Addressing the inherent complexities, we introduce a new approach to annotating enzymes based on their catalyzed reactions. This method provides detailed insights into specific reactions and is adaptable to newly discovered reactions, diverging from traditional classifications by protein family or expert-derived reaction classes. We employ machine learning algorithms to analyze enzyme reaction datasets, delivering a much more refined view of enzyme functionality. Our evaluation leverages the largest enzyme-reaction dataset to date, derived from the SwissProt and Rhea databases with entries up to January 8, 2024. We frame enzyme-reaction prediction as a retrieval problem, aiming to rank enzymes by their catalytic ability for specific reactions. With our model, we can recruit proteins for novel reactions and predict reactions for novel proteins, facilitating enzyme discovery and function annotation (https://github.com/WillHua127/ReactZyme).
Submitted 30 September, 2024; v1 submitted 24 August, 2024; originally announced August 2024.
Journal ref: 38th Conference on Neural Information Processing Systems (NeurIPS 2024) Track on Datasets and Benchmarks
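The retrieval framing can be illustrated in a few lines of NumPy: embed reactions and enzymes in a shared space and rank candidate enzymes by similarity to the query reaction. The random embeddings below are placeholders standing in for learned protein and reaction encoders and are purely illustrative.

```python
import numpy as np

rng = np.random.default_rng(1)

# Placeholder embeddings; in a real system these would come from learned
# protein and reaction encoders projected into a shared space.
enzyme_emb = rng.normal(size=(1000, 128))    # one row per candidate enzyme
reaction_emb = rng.normal(size=(128,))       # query reaction

def rank_enzymes(reaction, enzymes, top_k=5):
    """Rank candidate enzymes for a reaction by cosine similarity."""
    enzymes = enzymes / np.linalg.norm(enzymes, axis=1, keepdims=True)
    reaction = reaction / np.linalg.norm(reaction)
    scores = enzymes @ reaction
    order = np.argsort(-scores)[:top_k]
    return list(zip(order.tolist(), scores[order].round(3).tolist()))

print(rank_enzymes(reaction_emb, enzyme_emb))
```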
arXiv:2408.12400 [pdf, other]  cs.CV cs.AI
Multi-Style Facial Sketch Synthesis through Masked Generative Modeling
Authors: Bowen Sun, Guo Lu, Shibao Zheng
Abstract: The facial sketch synthesis (FSS) model, capable of generating sketch portraits from given facial photographs, holds profound implications across multiple domains, encompassing cross-modal face recognition, entertainment, art, and media, among others. However, the production of high-quality sketches remains a formidable task, primarily due to the challenges and flaws associated with three key factors: (1) the scarcity of artist-drawn data, (2) the constraints imposed by limited style types, and (3) the deficiencies of processing input information in existing models. To address these difficulties, we propose a lightweight end-to-end synthesis model that efficiently converts images to corresponding multi-stylized sketches, obviating the necessity for any supplementary inputs (e.g., 3D geometry). In this study, we overcome the issue of data insufficiency by incorporating semi-supervised learning into the training process. Additionally, we employ a feature extraction module and style embeddings to proficiently steer the generative transformer during the iterative prediction of masked image tokens, thus achieving a continuous stylized output that retains facial features accurately in sketches. Extensive experiments demonstrate that our method consistently outperforms previous algorithms across multiple benchmarks by a discernible margin.
Submitted 22 August, 2024; originally announced August 2024.

arXiv:2408.12102 [pdf, other]  cs.LG cs.CV cs.SD eess.AS
Integrating Audio, Visual, and Semantic Information for Enhanced Multimodal Speaker Diarization
Authors: Luyao Cheng, Hui Wang, Siqi Zheng, Yafeng Chen, Rongjie Huang, Qinglin Zhang, Qian Chen, Xihao Li
Abstract: Speaker diarization, the process of segmenting an audio stream or transcribed speech content into homogeneous partitions based on speaker identity, plays a crucial role in the interpretation and analysis of human speech. Most existing speaker diarization systems rely exclusively on unimodal acoustic information, making the task particularly challenging due to the innate ambiguities of audio signals. Recent studies have made tremendous efforts towards audio-visual or audio-semantic modeling to enhance performance. However, even the incorporation of up to two modalities often falls short in addressing the complexities of spontaneous and unstructured conversations. To exploit more meaningful dialogue patterns, we propose a novel multimodal approach that jointly utilizes audio, visual, and semantic cues to enhance speaker diarization. Our method elegantly formulates the multimodal modeling as a constrained optimization problem. First, we build insights into the visual connections among active speakers and the semantic interactions within spoken content, thereby establishing abundant pairwise constraints. Then we introduce a joint pairwise constraint propagation algorithm to cluster speakers based on these visual and semantic constraints. This integration effectively leverages the complementary strengths of different modalities, refining the affinity estimation between individual speaker embeddings. Extensive experiments conducted on multiple multimodal datasets demonstrate that our approach consistently outperforms state-of-the-art speaker diarization methods.
Submitted 21 August, 2024; originally announced August 2024.
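A toy version of constraint-aware clustering conveys the flavour of using visual and semantic cues alongside acoustic affinities: pairwise must-link and cannot-link constraints adjust an affinity matrix before speakers are clustered. The hard 0/1 constraint values and the agglomerative clustering used below are simplifying assumptions for illustration; the paper's joint pairwise constraint propagation is more principled than this direct overwrite.

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

def constrained_diarization(affinity, must_link, cannot_link, n_speakers):
    """Toy constrained clustering of speaker-embedding affinities.

    affinity: (N, N) symmetric similarity in [0, 1].
    must_link / cannot_link: lists of (i, j) pairs from visual/semantic cues.
    """
    A = affinity.copy()
    for i, j in must_link:            # e.g. same visible active speaker
        A[i, j] = A[j, i] = 1.0
    for i, j in cannot_link:          # e.g. two distinct faces speaking
        A[i, j] = A[j, i] = 0.0
    dist = 1.0 - A
    np.fill_diagonal(dist, 0.0)
    Z = linkage(squareform(dist, checks=False), method="average")
    return fcluster(Z, t=n_speakers, criterion="maxclust")

rng = np.random.default_rng(0)
emb = np.vstack([rng.normal(0, 0.1, (5, 8)), rng.normal(1, 0.1, (5, 8))])
aff = 1.0 / (1.0 + np.linalg.norm(emb[:, None] - emb[None, :], axis=-1))
print(constrained_diarization(aff, [(0, 1)], [(0, 9)], n_speakers=2))
```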
arXiv:2408.09933 [pdf, other]  cs.SD cs.AI eess.AS
SZU-AFS Antispoofing System for the ASVspoof 5 Challenge
Authors: Yuxiong Xu, Jiafeng Zhong, Sengui Zheng, Zefeng Liu, Bin Li
Abstract: This paper presents the SZU-AFS anti-spoofing system, designed for Track 1 of the ASVspoof 5 Challenge under open conditions. The system is built in four stages: selecting a baseline model, exploring effective data augmentation (DA) methods for fine-tuning, applying a co-enhancement strategy based on gradient norm aware minimization (GAM) for secondary fine-tuning, and fusing logit scores from the two best-performing fine-tuned models. The system utilizes the Wav2Vec2 front-end feature extractor and the AASIST back-end classifier as the baseline model. During model fine-tuning, three distinct DA policies have been investigated: single-DA, random-DA, and cascade-DA. Moreover, the employed GAM-based co-enhancement strategy, designed to fine-tune the augmented model at both the data and optimizer levels, helps the Adam optimizer find flatter minima, thereby boosting model generalization. Overall, the final fusion system achieves a minDCF of 0.115 and an EER of 4.04% on the evaluation set.
Submitted 19 August, 2024; originally announced August 2024.
Comments: 8 pages, 2 figures, ASVspoof 5 Workshop (Interspeech2024 Satellite)
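Score-level fusion of two fine-tuned models can be as simple as mixing calibrated per-utterance scores. The sketch below converts each model's two-class logits into a log-likelihood-ratio-style score and averages them with an assumed equal weight; the actual fusion weights and calibration of the submitted system are not specified in the abstract.

```python
import numpy as np

def fuse_scores(logits_a, logits_b, weight_a=0.5):
    """Score-level fusion of two spoofing classifiers.

    logits_a, logits_b: (N, 2) logits with class order [spoof, bona fide].
    Returns one detection score per utterance (higher = more bona fide).
    The equal weighting is an assumption for illustration.
    """
    def llr(logits):                      # log-likelihood-ratio-style score
        p = np.exp(logits - logits.max(axis=1, keepdims=True))
        p /= p.sum(axis=1, keepdims=True)
        return np.log(p[:, 1] + 1e-9) - np.log(p[:, 0] + 1e-9)

    return weight_a * llr(logits_a) + (1 - weight_a) * llr(logits_b)

a = np.array([[2.0, -1.0], [-0.5, 1.5]])
b = np.array([[1.0, 0.0], [-1.0, 2.0]])
print(fuse_scores(a, b))
```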
arXiv:2408.05341 [pdf, other]  cs.CV cs.AI
CAR: Contrast-Agnostic Deformable Medical Image Registration with Contrast-Invariant Latent Regularization
Authors: Yinsong Wang, Siyi Du, Shaoming Zheng, Xinzhe Luo, Chen Qin
Abstract: Multi-contrast image registration is a challenging task due to the complex intensity relationships between different imaging contrasts. Conventional image registration methods are typically based on iterative optimization for each input image pair, which is time-consuming and sensitive to contrast variations. While learning-based approaches are much faster during the inference stage, due to generalizability issues they typically can only be applied to the fixed contrasts observed during the training stage. In this work, we propose a novel contrast-agnostic deformable image registration framework that can be generalized to arbitrary contrast images without observing them during training. In particular, we propose a random convolution-based contrast augmentation scheme, which simulates arbitrary image contrasts from a single image contrast while preserving the inherent structural information. To ensure that the network can learn contrast-invariant representations for facilitating contrast-agnostic registration, we further introduce contrast-invariant latent regularization (CLR), which regularizes the representation in latent space through a contrast invariance loss. Experiments show that CAR outperforms the baseline approaches regarding registration accuracy and also possesses better generalization ability to unseen imaging contrasts. Code is available at https://github.com/Yinsong0510/CAR.
Submitted 3 August, 2024; originally announced August 2024.
Comments: 12 pages, 3 figures, 3 tables, accepted by WBIR 2024
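The random convolution-based contrast augmentation idea can be illustrated compactly: pass a single-contrast image through a randomly initialised convolution so that intensities are remapped while the underlying structure is preserved. The kernel size, normalisation, and input shape below are assumptions for demonstration, not the paper's exact augmentation pipeline.

```python
import torch
import torch.nn.functional as F

def random_conv_contrast(img, kernel_size=1):
    """Apply a randomly initialised convolution to simulate a new contrast.

    img: (B, 1, H, W) single-contrast image batch. A random kernel remaps
    intensities (and, for kernel_size > 1, mixes a small neighbourhood)
    while leaving the anatomy/structure in place. Output is min-max rescaled.
    """
    weight = torch.randn(1, 1, kernel_size, kernel_size)
    bias = torch.randn(1)
    out = F.conv2d(img, weight, bias, padding=kernel_size // 2)
    lo = out.amin(dim=(2, 3), keepdim=True)
    hi = out.amax(dim=(2, 3), keepdim=True)
    return (out - lo) / (hi - lo + 1e-8)

x = torch.rand(2, 1, 64, 64)          # stand-in for single-contrast MRI slices
print(random_conv_contrast(x).shape)  # torch.Size([2, 1, 64, 64])
```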
arXiv:2408.03194 [pdf, other]  eess.IV cs.CV
SGSR: Structure-Guided Multi-Contrast MRI Super-Resolution via Spatio-Frequency Co-Query Attention
Authors: Shaoming Zheng, Yinsong Wang, Siyi Du, Chen Qin
Abstract: Magnetic Resonance Imaging (MRI) is a leading diagnostic modality for a wide range of exams, where multiple contrast images are often acquired for characterizing different tissues. However, acquiring high-resolution MRI typically extends scan time, which can introduce motion artifacts. Super-resolution of MRI therefore emerges as a promising approach to mitigate these challenges. Earlier studies have investigated the use of multiple contrasts for MRI super-resolution (MCSR), whereas the majority of them did not fully exploit the rich contrast-invariant structural information. To fully utilize such crucial prior knowledge of multi-contrast MRI, in this work we propose a novel structure-guided MCSR (SGSR) framework based on a new spatio-frequency co-query attention (CQA) mechanism. Specifically, CQA performs attention on features of multiple contrasts with a shared structural query, which is particularly designed to extract, fuse, and refine the common structures from different contrasts. We further propose a novel frequency-domain CQA module in addition to the spatial domain, to enable more fine-grained structural refinement. Extensive experiments on fastMRI knee data and low-field brain MRI show that SGSR outperforms state-of-the-art MCSR methods with statistical significance.
Submitted 6 August, 2024; originally announced August 2024.
Comments: The 15th International Workshop on Machine Learning in Medical Imaging (MLMI 2024)

arXiv:2408.03166 [pdf, other]  cs.IR
CADRL: Category-aware Dual-agent Reinforcement Learning for Explainable Recommendations over Knowledge Graphs
Authors: Shangfei Zheng, Hongzhi Yin, Tong Chen, Xiangjie Kong, Jian Hou, Pengpeng Zhao
Abstract: Knowledge graphs (KGs) have been widely adopted to mitigate data sparsity and address cold-start issues in recommender systems. While existing KG-based recommendation methods can predict user preferences and demands, they fall short in generating explicit recommendation paths and lack explainability. As a step beyond the above methods, recent advancements utilize reinforcement learning (RL) to find suitable items for a given user via explainable recommendation paths. However, the performance of these solutions is still limited by two points: (1) a lack of ability to capture contextual dependencies from neighboring information, and (2) an excessive reliance on short recommendation paths due to efficiency concerns. To surmount these challenges, we propose a category-aware dual-agent reinforcement learning (CADRL) model for explainable recommendations over KGs. Specifically, our model comprises two components: (1) a category-aware gated graph neural network that jointly captures context-aware item representations from neighboring entities and categories, and (2) a dual-agent RL framework where two agents efficiently traverse long paths to search for suitable items. Finally, experimental results show that CADRL outperforms state-of-the-art models in terms of both effectiveness and efficiency on large-scale datasets.
Submitted 6 August, 2024; originally announced August 2024.

arXiv:2407.19512 [pdf, other]  cs.CV
Large-scale cervical precancerous screening via AI-assisted cytology whole slide image analysis
Authors: Honglin Li, Yusuan Sun, Chenglu Zhu, Yunlong Zhang, Shichuan Zhang, Zhongyi Shui, Pingyi Chen, Jingxiong Li, Sunyi Zheng, Can Cui, Lin Yang
Abstract: Cervical cancer continues to be the leading gynecological malignancy, posing a persistent threat to women's health on a global scale. Early screening via cytology Whole Slide Image (WSI) diagnosis is critical to prevent cancer progression and improve survival rates, but a pathologist's single test suffers inevitable false negatives due to the immense number of cells that need to be reviewed within a WSI. Though computer-aided automated diagnostic models can serve as a strong complement to pathologists, their effectiveness is hampered by the paucity of extensive and detailed annotations, coupled with limited interpretability and robustness. These factors significantly hinder their practical applicability and reliability in clinical settings. To tackle these challenges, we develop an AI approach, a Scalable Technology for Robust and Interpretable Diagnosis built on Extensive data (STRIDE) of cervical cytology. STRIDE addresses the bottleneck of limited annotations by integrating patient-level labels with a small portion of cell-level labels through an end-to-end training strategy, facilitating scalable learning across extensive datasets. To further improve robustness to the real-world domain shifts of cytology slide-making and imaging, STRIDE employs color adversarial sample training that mimics staining and imaging variations. Lastly, to achieve pathologist-level interpretability for trustworthiness in clinical settings, STRIDE can generate explanatory textual descriptions that simulate pathologists' diagnostic processes by aligning cell image features with textual descriptions. Conducting extensive experiments and evaluations in 183 medical centers with a dataset of 341,889 WSIs and 0.1 billion cells from cervical cytology patients, STRIDE has demonstrated remarkable superiority over previous state-of-the-art techniques.
Submitted 28 July, 2024; originally announced July 2024.

arXiv:2407.10782 [pdf, other]  cs.RO
LVCP: LiDAR-Vision Tightly Coupled Collaborative Real-time Relative Positioning
Authors: Zhuozhu Jian, Qixuan Li, Shengtao Zheng, Xueqian Wang, Xinlei Chen
Abstract: In air-ground collaboration scenarios without GPS and prior maps, the relative positioning of drones and unmanned ground vehicles (UGVs) has always been a challenge. For a drone equipped with a monocular camera and a UGV equipped with LiDAR as an external sensor, we propose a robust and real-time relative pose estimation method (LVCP) based on the tight coupling of vision and LiDAR point cloud information, which does not require prior information such as maps or precise initial poses. Given that the large-scale point clouds generated by 3D sensors have more accurate spatial geometric information than the feature point clouds generated by images, we utilize LiDAR point clouds to correct the drift in visual-inertial odometry (VIO) when the camera undergoes significant shaking or the IMU has a low signal-to-noise ratio. To achieve this, we propose a novel coarse-to-fine framework for LiDAR-vision collaborative localization. In this framework, we construct point-plane associations based on spatial geometric information and innovatively construct a point-aided Bundle Adjustment (BA) problem as the backend to simultaneously estimate the relative pose of the camera and LiDAR and correct the VIO drift. In this process, we propose a particle swarm optimization (PSO) based sampling algorithm to complete the coarse estimation of the current camera-LiDAR pose; the initial camera pose used for sampling is obtained from VIO propagation, and the valid feature-plane association number (VFPN) is used to trigger the PSO sampling process. Additionally, we propose a method that combines Structure from Motion (SFM) and multi-level sampling to initialize the algorithm, addressing the challenge of lacking initial values.
Submitted 15 July, 2024; originally announced July 2024.
Comments: See more details in https://sites.google.com/view/lvcp
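For readers unfamiliar with PSO-based sampling, the generic minimiser below shows the mechanism: a swarm of candidate solutions is iteratively pulled toward each particle's personal best and the global best. The objective here is a toy 3-DoF residual; the paper's actual cost is built from point-plane associations and camera-LiDAR geometry and is considerably more involved.

```python
import numpy as np

def pso_minimize(cost, dim, n_particles=30, iters=100, lo=-3.0, hi=3.0, seed=0):
    """Generic particle swarm optimisation over the box [lo, hi]^dim."""
    rng = np.random.default_rng(seed)
    x = rng.uniform(lo, hi, (n_particles, dim))      # particle positions
    v = np.zeros_like(x)                             # particle velocities
    pbest, pbest_cost = x.copy(), np.array([cost(p) for p in x])
    gbest = pbest[pbest_cost.argmin()].copy()
    for _ in range(iters):
        r1, r2 = rng.random((2, n_particles, dim))
        v = 0.7 * v + 1.5 * r1 * (pbest - x) + 1.5 * r2 * (gbest - x)
        x = np.clip(x + v, lo, hi)
        c = np.array([cost(p) for p in x])
        improved = c < pbest_cost
        pbest[improved], pbest_cost[improved] = x[improved], c[improved]
        gbest = pbest[pbest_cost.argmin()].copy()
    return gbest, pbest_cost.min()

# Toy stand-in for a pose-alignment residual: distance to a "true" 3-DoF pose.
true_pose = np.array([0.5, -1.0, 0.25])              # e.g. (x, y, yaw) -- illustrative
cost = lambda p: float(np.sum((p - true_pose) ** 2))
print(pso_minimize(cost, dim=3))
```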
Additionally, we introduce a Storyline Contextualizer to enrich the context in the storyline embedding and a StoryFlow Adapter to measure scene changes between frames to guide the model. Extensive experiments on PororoSV and FlintstonesSV benchmarks demonstrate that ContextualStory significantly outperforms existing methods in both story visualization and story continuation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09774v2-abstract-full').style.display = 'none'; document.getElementById('2407.09774v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08234">arXiv:2407.08234</a> <span> [<a href="https://arxiv.org/pdf/2407.08234">pdf</a>, <a href="https://arxiv.org/format/2407.08234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Model Predictive Control For Mobile Manipulators Based On Neural Dynamics (Extended version) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+T">Tao Su</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shiqi Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08234v1-abstract-short" style="display: inline;"> This article focuses on the trajectory tracking problem of mobile manipulators (MMs). Firstly, we construct a position and orientation model predictive tracking control (POMPTC) scheme for mobile manipulators. The proposed POMPTC scheme can simultaneously minimize the tracking error, joint velocity, and joint acceleration. Moreover, it can achieve synchronous control for the position and orientati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08234v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08234v1-abstract-full" style="display: none;"> This article focuses on the trajectory tracking problem of mobile manipulators (MMs). Firstly, we construct a position and orientation model predictive tracking control (POMPTC) scheme for mobile manipulators. The proposed POMPTC scheme can simultaneously minimize the tracking error, joint velocity, and joint acceleration. Moreover, it can achieve synchronous control for the position and orientation of the end-effector. Secondly, a finite-time convergent neural dynamics (FTCND) model is constructed to find the optimal solution of the POMPTC scheme. Then, based on the proposed POMPTC scheme, a non-singular fast terminal sliding mode (NFTSM) control method is presented, which considers the disturbances caused by the base motion on the manipulator at the dynamic level.
It can achieve finite-time tracking performance and improve the anti-disturbances ability. Finally, simulation and experiments show that the proposed control method has the advantages of strong robustness, fast convergence, and high control accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08234v1-abstract-full').style.display = 'none'; document.getElementById('2407.08234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This article consists of 13 pages, including the text and the proof process</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08223">arXiv:2407.08223</a> <span> [<a href="https://arxiv.org/pdf/2407.08223">pdf</a>, <a href="https://arxiv.org/format/2407.08223">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Speculative RAG: Enhancing Retrieval Augmented Generation through Drafting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zilong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zifeng Wang</a>, <a href="/search/cs?searchtype=author&query=Le%2C+L">Long Le</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H+S">Huaixiu Steven Zheng</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+S">Swaroop Mishra</a>, <a href="/search/cs?searchtype=author&query=Perot%2C+V">Vincent Perot</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Mattapalli%2C+A">Anush Mattapalli</a>, <a href="/search/cs?searchtype=author&query=Taly%2C+A">Ankur Taly</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+J">Jingbo Shang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+C">Chen-Yu Lee</a>, <a href="/search/cs?searchtype=author&query=Pfister%2C+T">Tomas Pfister</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08223v1-abstract-short" style="display: inline;"> Retrieval augmented generation (RAG) combines the generative abilities of large language models (LLMs) with external knowledge sources to provide more accurate and up-to-date responses. Recent RAG advancements focus on improving retrieval outcomes through iterative LLM refinement or self-critique capabilities acquired through additional instruction tuning of LLMs. 
In this work, we introduce Specul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08223v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08223v1-abstract-full" style="display: none;"> Retrieval augmented generation (RAG) combines the generative abilities of large language models (LLMs) with external knowledge sources to provide more accurate and up-to-date responses. Recent RAG advancements focus on improving retrieval outcomes through iterative LLM refinement or self-critique capabilities acquired through additional instruction tuning of LLMs. In this work, we introduce Speculative RAG - a framework that leverages a larger generalist LM to efficiently verify multiple RAG drafts produced in parallel by a smaller, distilled specialist LM. Each draft is generated from a distinct subset of retrieved documents, offering diverse perspectives on the evidence while reducing input token counts per draft. This approach enhances comprehension of each subset and mitigates potential position bias over long context. Our method accelerates RAG by delegating drafting to the smaller specialist LM, with the larger generalist LM performing a single verification pass over the drafts. Extensive experiments demonstrate that Speculative RAG achieves state-of-the-art performance with reduced latency on TriviaQA, MuSiQue, PubHealth, and ARC-Challenge benchmarks. It notably enhances accuracy by up to 12.97% while reducing latency by 51% compared to conventional RAG systems on PubHealth. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08223v1-abstract-full').style.display = 'none'; document.getElementById('2407.08223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07582">arXiv:2407.07582</a> <span> [<a href="https://arxiv.org/pdf/2407.07582">pdf</a>, <a href="https://arxiv.org/format/2407.07582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TIP: Tabular-Image Pre-training for Multimodal Classification with Incomplete Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+S">Siyi Du</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shaoming Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yinsong Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+W">Wenjia Bai</a>, <a href="/search/cs?searchtype=author&query=O%27Regan%2C+D+P">Declan P. 
O'Regan</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+C">Chen Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07582v1-abstract-short" style="display: inline;"> Images and structured tables are essential parts of real-world databases. Though tabular-image representation learning is promising to create new insights, it remains a challenging task, as tabular data is typically heterogeneous and incomplete, presenting significant modality disparities with images. Earlier works have mainly focused on simple modality fusion strategies in complete data scenarios… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07582v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07582v1-abstract-full" style="display: none;"> Images and structured tables are essential parts of real-world databases. Though tabular-image representation learning is promising to create new insights, it remains a challenging task, as tabular data is typically heterogeneous and incomplete, presenting significant modality disparities with images. Earlier works have mainly focused on simple modality fusion strategies in complete data scenarios, without considering the missing data issue, and thus are limited in practice. In this paper, we propose TIP, a novel tabular-image pre-training framework for learning multimodal representations robust to incomplete tabular data. Specifically, TIP investigates a novel self-supervised learning (SSL) strategy, including a masked tabular reconstruction task for tackling data missingness, and image-tabular matching and contrastive learning objectives to capture multimodal information. Moreover, TIP proposes a versatile tabular encoder tailored for incomplete, heterogeneous tabular data and a multimodal interaction module for inter-modality representation learning. Experiments are performed on downstream multimodal classification tasks using both natural and medical image datasets. The results show that TIP outperforms state-of-the-art supervised/SSL image/multimodal algorithms in both complete and incomplete data scenarios. Our code is available at https://github.com/siyi-wind/TIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07582v1-abstract-full').style.display = 'none'; document.getElementById('2407.07582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages (including 9 pages of supplementary materials), accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05603">arXiv:2407.05603</a> <span> [<a href="https://arxiv.org/pdf/2407.05603">pdf</a>, <a href="https://arxiv.org/format/2407.05603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> WSI-VQA: Interpreting Whole Slide Images by Generative Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pingyi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenglu Zhu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sunyi Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Honglin Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05603v1-abstract-short" style="display: inline;"> Whole slide imaging is routinely adopted for carcinoma diagnosis and prognosis. Abundant experience is required for pathologists to achieve accurate and reliable diagnostic results of whole slide images (WSI). The huge size and heterogeneous features of WSIs make the workflow of pathological reading extremely time-consuming. In this paper, we propose a novel framework (WSI-VQA) to interpret WSIs b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05603v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05603v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05603v1-abstract-full" style="display: none;"> Whole slide imaging is routinely adopted for carcinoma diagnosis and prognosis. Abundant experience is required for pathologists to achieve accurate and reliable diagnostic results of whole slide images (WSI). The huge size and heterogeneous features of WSIs make the workflow of pathological reading extremely time-consuming. In this paper, we propose a novel framework (WSI-VQA) to interpret WSIs by generative visual question answering. WSI-VQA shows universality by reframing various kinds of slide-level tasks in a question-answering pattern, in which pathologists can achieve immunohistochemical grading, survival prediction, and tumor subtyping following human-machine interaction. Furthermore, we establish a WSI-VQA dataset which contains 8672 slide-level question-answering pairs with 977 WSIs. Besides the ability to deal with different slide-level tasks, our generative model which is named Wsi2Text Transformer (W2T) outperforms existing discriminative models in medical correctness, which reveals the potential of our model to be applied in the clinical scenario. Additionally, we also visualize the co-attention mapping between word embeddings and WSIs as an intuitive explanation for diagnostic results. 
The dataset and related code are available at https://github.com/cpystan/WSI-VQA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05603v1-abstract-full').style.display = 'none'; document.getElementById('2407.05603v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05407">arXiv:2407.05407</a> <span> [<a href="https://arxiv.org/pdf/2407.05407">pdf</a>, <a href="https://arxiv.org/format/2407.05407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shiliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yexin Yang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yue Gu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhijie Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05407v2-abstract-short" style="display: inline;"> Recent years have witnessed a trend that large language model (LLM) based text-to-speech (TTS) emerges into the mainstream due to their high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed by a token-based vocoder to waveforms. 
Obviously, speech tokens play a critical role… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05407v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05407v2-abstract-full" style="display: none;"> Recent years have witnessed a trend that large language model (LLM) based text-to-speech (TTS) emerges into the mainstream due to their high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed by a token-based vocoder to waveforms. Obviously, speech tokens play a critical role in LLM-based TTS models. Current speech tokens are learned in an unsupervised manner, which lacks explicit semantic information and alignment to the text. In this paper, we propose to represent speech with supervised semantic tokens, which are derived from a multilingual speech recognition model by inserting vector quantization into the encoder. Based on the tokens, we further propose a scalable zero-shot TTS synthesizer, CosyVoice, which consists of an LLM for text-to-token generation and a conditional flow matching model for token-to-speech synthesis. Experimental results show that supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning. Moreover, we find that utilizing large-scale data further improves the synthesis performance, indicating the scalable capacity of CosyVoice. To the best of our knowledge, this is the first attempt to involve supervised speech tokens into TTS models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05407v2-abstract-full').style.display = 'none'; document.getElementById('2407.05407v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">work in progress. 
arXiv admin note: substantial text overlap with arXiv:2407.04051</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05364">arXiv:2407.05364</a> <span> [<a href="https://arxiv.org/pdf/2407.05364">pdf</a>, <a href="https://arxiv.org/format/2407.05364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PTaRL: Prototype-based Tabular Representation Learning via Space Calibration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hangting Ye</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wei Fan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaozhuang Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shun Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">He Zhao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Dandan Guo</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05364v2-abstract-short" style="display: inline;"> Tabular data have been playing a mostly important role in diverse real-world fields, such as healthcare, engineering, finance, etc. With the recent success of deep learning, many tabular machine learning (ML) methods based on deep networks (e.g., Transformer, ResNet) have achieved competitive performance on tabular benchmarks. However, existing deep tabular ML methods suffer from the representatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05364v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05364v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05364v2-abstract-full" style="display: none;"> Tabular data have been playing a mostly important role in diverse real-world fields, such as healthcare, engineering, finance, etc. With the recent success of deep learning, many tabular machine learning (ML) methods based on deep networks (e.g., Transformer, ResNet) have achieved competitive performance on tabular benchmarks. However, existing deep tabular ML methods suffer from the representation entanglement and localization, which largely hinders their prediction performance and leads to performance inconsistency on tabular tasks. To overcome these problems, we explore a novel direction of applying prototype learning for tabular ML and propose a prototype-based tabular representation learning framework, PTaRL, for tabular prediction tasks. The core idea of PTaRL is to construct prototype-based projection space (P-Space) and learn the disentangled representation around global data prototypes. Specifically, PTaRL mainly involves two stages: (i) Prototype Generation, that constructs global prototypes as the basis vectors of P-Space for representation, and (ii) Prototype Projection, that projects the data samples into P-Space and keeps the core global data information via Optimal Transport. 
Then, to further acquire the disentangled representations, we constrain PTaRL with two strategies: (i) to diversify the coordinates towards global prototypes of different representations within P-Space, we bring up a diversification constraint for representation calibration; (ii) to avoid prototype entanglement in P-Space, we introduce a matrix orthogonalization constraint to ensure the independence of global prototypes. Finally, we conduct extensive experiments in PTaRL coupled with state-of-the-art deep tabular ML models on various tabular benchmarks and the results have shown our consistent superiority. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05364v2-abstract-full').style.display = 'none'; document.getElementById('2407.05364v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04379">arXiv:2407.04379</a> <span> [<a href="https://arxiv.org/pdf/2407.04379">pdf</a>, <a href="https://arxiv.org/format/2407.04379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Mapping Strategy for Interacting with Latent Audio Synthesis Using Artistic Materials </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Shuoyang Zheng</a>, <a href="/search/cs?searchtype=author&query=Sed%C3%B3%2C+A+X">Anna Xambó Sedó</a>, <a href="/search/cs?searchtype=author&query=Bryan-Kinns%2C+N">Nick Bryan-Kinns</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04379v1-abstract-short" style="display: inline;"> This paper presents a mapping strategy for interacting with the latent spaces of generative AI models. Our approach involves using unsupervised feature learning to encode a human control space and mapping it to an audio synthesis model's latent space. To demonstrate how this mapping strategy can turn high-dimensional sensor data into control mechanisms of a deep generative model, we present a proo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04379v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04379v1-abstract-full" style="display: none;"> This paper presents a mapping strategy for interacting with the latent spaces of generative AI models.
Our approach involves using unsupervised feature learning to encode a human control space and mapping it to an audio synthesis model's latent space. To demonstrate how this mapping strategy can turn high-dimensional sensor data into control mechanisms of a deep generative model, we present a proof-of-concept system that uses visual sketches to control an audio synthesis model. We draw on emerging discourses in XAIxArts to discuss how this approach can contribute to XAI in artistic and creative contexts, we also discuss its current limitations and propose future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04379v1-abstract-full').style.display = 'none'; document.getElementById('2407.04379v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> XAIxArts/2024/10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04051">arXiv:2407.04051</a> <span> [<a href="https://arxiv.org/pdf/2407.04051">pdf</a>, <a href="https://arxiv.org/format/2407.04051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+K">Keyu An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Changfeng Gao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yue Gu</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Ting He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yabin Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zerui Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haoneng Luo</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xiang Lv</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+B">Bin Ma</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Changhe Song</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiaqi Shi</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a 
href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04051v3-abstract-short" style="display: inline;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'inline'; document.getElementById('2407.04051v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04051v3-abstract-full" style="display: none;"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs). At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for 5 languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multi-lingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on Modelscope and Huggingface, along with the corresponding training, inference, and fine-tuning codes released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04051v3-abstract-full').style.display = 'none'; document.getElementById('2407.04051v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress. 
Authors are listed in alphabetical order by family name</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02833">arXiv:2407.02833</a> <span> [<a href="https://arxiv.org/pdf/2407.02833">pdf</a>, <a href="https://arxiv.org/format/2407.02833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LANE: Logic Alignment of Non-tuning Large Language Models and Online Recommendation Systems for Explainable Reason Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hongke Zhao</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Songming Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Likang Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02833v1-abstract-short" style="display: inline;"> The explainability of recommendation systems is crucial for enhancing user trust and satisfaction. Leveraging large language models (LLMs) offers new opportunities for comprehensive recommendation logic generation. However, in existing related studies, fine-tuning LLM models for recommendation tasks incurs high computational costs and alignment issues with existing systems, limiting the applicatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02833v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02833v1-abstract-full" style="display: none;"> The explainability of recommendation systems is crucial for enhancing user trust and satisfaction. Leveraging large language models (LLMs) offers new opportunities for comprehensive recommendation logic generation. However, in existing related studies, fine-tuning LLM models for recommendation tasks incurs high computational costs and alignment issues with existing systems, limiting the application potential of proven proprietary/closed-source LLM models, such as GPT-4. In this work, our proposed effective strategy LANE aligns LLMs with online recommendation systems without additional LLMs tuning, reducing costs and improving explainability. This innovative approach addresses key challenges in integrating language models with recommendation systems while fully utilizing the capabilities of powerful proprietary models. Specifically, our strategy operates through several key components: semantic embedding, user multi-preference extraction using zero-shot prompting, semantic alignment, and explainable recommendation generation using Chain of Thought (CoT) prompting. 
By embedding item titles instead of IDs and utilizing multi-head attention mechanisms, our approach aligns the semantic features of user preferences with those of candidate items, ensuring coherent and user-aligned recommendations. Sufficient experimental results including performance comparison, questionnaire voting, and visualization cases prove that our method can not only ensure recommendation performance, but also provide easy-to-understand and reasonable recommendation logic. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02833v1-abstract-full').style.display = 'none'; document.getElementById('2407.02833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02049">arXiv:2407.02049</a> <span> [<a href="https://arxiv.org/pdf/2407.02049">pdf</a>, <a href="https://arxiv.org/format/2407.02049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Accompanied Singing Voice Synthesis with Fully Text-controlled Melody </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruiqi Li</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Z">Zhiqing Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lichao Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02049v1-abstract-short" style="display: inline;"> Text-to-song (TTSong) is a music generation task that synthesizes accompanied singing voices. Current TTSong methods, inherited from singing voice synthesis (SVS), require melody-related information that can sometimes be impractical, such as music scores or MIDI sequences. We present MelodyLM, the first TTSong model that generates high-quality song pieces with fully text-controlled melodies, achie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02049v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02049v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02049v1-abstract-full" style="display: none;"> Text-to-song (TTSong) is a music generation task that synthesizes accompanied singing voices. Current TTSong methods, inherited from singing voice synthesis (SVS), require melody-related information that can sometimes be impractical, such as music scores or MIDI sequences. 
We present MelodyLM, the first TTSong model that generates high-quality song pieces with fully text-controlled melodies, achieving minimal user requirements and maximum control flexibility. MelodyLM explicitly models MIDI as the intermediate melody-related feature and sequentially generates vocal tracks in a language model manner, conditioned on textual and vocal prompts. The accompaniment music is subsequently synthesized by a latent diffusion model with hybrid conditioning for temporal alignment. With minimal requirements, users only need to input lyrics and a reference voice to synthesize a song sample. For full control, just input textual prompts or even directly input MIDI. Experimental results indicate that MelodyLM achieves superior performance in terms of both objective and subjective metrics. Audio samples are available at https://melodylm666.github.io. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in progress</span> </p> </li> </ol>
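<p>The LVCP entry (arXiv:2407.10782) above describes a particle swarm optimization (PSO) based sampling step for coarse camera-LiDAR pose estimation. The snippet below is a minimal, generic PSO sketch of that kind of coarse search, assuming a caller-supplied alignment cost (for example, summed point-to-plane residuals) and a VIO-propagated prior pose; the function name, parameters, and cost are illustrative placeholders, not the authors' implementation.</p>
<pre><code>import numpy as np

def pso_coarse_pose(cost, prior_pose, bounds, n_particles=64, n_iters=30,
                    w=0.7, c1=1.5, c2=1.5, seed=0):
    """Generic particle swarm search over a pose vector.

    cost(pose) is a caller-supplied scalar alignment error; prior_pose is an
    initial guess such as a VIO-propagated pose; bounds gives the per-dimension
    search half-width. All names and defaults here are illustrative placeholders.
    """
    rng = np.random.default_rng(seed)
    dim = len(prior_pose)
    # Scatter particles around the prior pose.
    pos = prior_pose + rng.uniform(-1.0, 1.0, (n_particles, dim)) * bounds
    vel = np.zeros_like(pos)
    pbest, pbest_cost = pos.copy(), np.array([cost(p) for p in pos])
    g = int(np.argmin(pbest_cost))
    gbest = pbest[g].copy()
    for _ in range(n_iters):
        r1, r2 = rng.random((n_particles, dim)), rng.random((n_particles, dim))
        # Standard PSO velocity update: inertia + personal best + global best.
        vel = w * vel + c1 * r1 * (pbest - pos) + c2 * r2 * (gbest - pos)
        pos = pos + vel
        cur = np.array([cost(p) for p in pos])
        improved = np.less(cur, pbest_cost)
        pbest = np.where(improved[:, None], pos, pbest)
        pbest_cost = np.where(improved, cur, pbest_cost)
        g = int(np.argmin(pbest_cost))
        gbest = pbest[g].copy()
    return gbest

# Toy usage with a quadratic stand-in for the real alignment cost.
target = np.array([0.3, -0.2, 0.1, 0.05])          # hypothetical true offset
estimate = pso_coarse_pose(lambda p: float(np.sum((p - target) ** 2)),
                           prior_pose=np.zeros(4),
                           bounds=np.array([1.0, 1.0, 0.5, 0.5]))
</code></pre>
<p>In the abstract's pipeline the fine estimate then comes from the point-aided bundle adjustment backend; the sketch only covers the coarse sampling stage.</p>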
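<p>The Speculative RAG entry (arXiv:2407.08223) above describes drafting answers in parallel with a smaller specialist LM over distinct subsets of retrieved documents and then verifying them with a larger generalist LM in a single pass. Below is a minimal control-flow sketch of that draft-then-verify idea, assuming placeholder callables <code>draft_lm</code> and <code>verifier_score</code>; the prompt layout and the subset partitioning are assumptions, not the paper's implementation.</p>
<pre><code>import random

def speculative_rag_answer(question, documents, draft_lm, verifier_score,
                           n_drafts=4, docs_per_draft=2, seed=0):
    """Draft-then-verify sketch.

    draft_lm(prompt) is assumed to return a candidate answer string from a
    small specialist model; verifier_score(question, answer, docs) is assumed
    to return a scalar score from a larger generalist model. Both are
    placeholders standing in for the models described in the abstract.
    """
    rng = random.Random(seed)
    shuffled = list(documents)
    rng.shuffle(shuffled)

    drafts = []
    for i in range(n_drafts):
        # Each draft sees a distinct document subset, keeping per-draft
        # context short and the evidence perspectives diverse.
        subset = shuffled[i * docs_per_draft:(i + 1) * docs_per_draft]
        if not subset:
            break
        prompt = "Context:\n" + "\n\n".join(subset) + "\n\nQuestion: " + question + "\nAnswer:"
        drafts.append((draft_lm(prompt), subset))

    # One verification pass over all drafts; return the best-scoring answer.
    scored = [(verifier_score(question, answer, subset), answer)
              for answer, subset in drafts]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[0][1]
</code></pre>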
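<p>The TIP entry (arXiv:2407.07582) above mentions a masked tabular reconstruction task for handling missing data. The sketch below illustrates the general pretext-task pattern of hiding a random fraction of tabular entries and scoring reconstruction only on the hidden ones; the shapes, mask ratio, and mask value are assumptions, and this is not taken from the authors' released repository linked in the abstract.</p>
<pre><code>import numpy as np

def mask_tabular_batch(x, mask_ratio=0.3, mask_value=0.0, seed=0):
    """Hide a random fraction of tabular entries (a generic masked-tabular
    pretext task). Returns the corrupted batch and the boolean mask of
    hidden entries. Ratio and mask value are illustrative assumptions."""
    rng = np.random.default_rng(seed)
    mask = np.less(rng.random(x.shape), mask_ratio)   # True where hidden
    corrupted = np.where(mask, mask_value, x)
    return corrupted, mask

def masked_reconstruction_loss(pred, target, mask):
    """Mean squared error computed only on the hidden entries."""
    diff = (pred - target) ** 2
    return float(diff[mask].mean())

# Toy usage: an 8-row, 5-column batch with the identity map standing in
# for a real encoder-decoder, so the loss reflects the corruption itself.
x = np.random.default_rng(1).normal(size=(8, 5))
corrupted, mask = mask_tabular_batch(x)
loss = masked_reconstruction_loss(corrupted, x, mask)
</code></pre>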
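<p>The CosyVoice entry (arXiv:2407.05407) above derives supervised semantic tokens by inserting vector quantization into a speech-recognition encoder. The snippet below is a minimal nearest-codebook quantizer showing how continuous encoder frames could be mapped to discrete token indices; the feature and codebook shapes are made up for illustration, and this is not the paper's tokenizer.</p>
<pre><code>import numpy as np

def quantize_to_tokens(encoder_features, codebook):
    """Map each encoder frame to its nearest codebook entry.

    encoder_features: (frames, dim) hidden states, e.g. from an ASR encoder.
    codebook: (codes, dim) learned code vectors. Both shapes are assumptions;
    the abstract only states that vector quantization is inserted into a
    multilingual speech-recognition encoder.
    """
    # Squared Euclidean distance between every frame and every code vector.
    dists = (np.sum(encoder_features ** 2, axis=1, keepdims=True)
             - 2.0 * encoder_features @ codebook.T
             + np.sum(codebook ** 2, axis=1))
    tokens = np.argmin(dists, axis=1)    # one discrete token per frame
    quantized = codebook[tokens]         # reconstruction from the codebook
    return tokens, quantized

# Toy usage: 100 frames of 64-dim features against a 512-entry codebook.
rng = np.random.default_rng(0)
tokens, _ = quantize_to_tokens(rng.normal(size=(100, 64)),
                               rng.normal(size=(512, 64)))
</code></pre>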
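<p>The PTaRL entry (arXiv:2407.05364) above constrains global prototypes with a matrix orthogonalization term and projects samples into a prototype-based space. The sketch below shows one plausible form of such an orthogonality penalty and a simplified softmax projection used as a stand-in for the Optimal-Transport projection the abstract describes; both are assumptions about the general technique rather than the paper's exact losses.</p>
<pre><code>import numpy as np

def prototype_orthogonality_penalty(prototypes):
    """Penalty that is zero when the global prototypes are orthonormal,
    one plausible form of the matrix-orthogonalization constraint the
    abstract mentions (the paper's exact loss may differ).
    prototypes: (num_prototypes, dim)."""
    gram = prototypes @ prototypes.T
    return float(np.sum((gram - np.eye(prototypes.shape[0])) ** 2))

def project_to_prototypes(sample, prototypes):
    """Represent a sample by softmax coordinates over the prototype basis,
    a simplified stand-in for the Optimal-Transport projection described
    in the abstract."""
    scores = prototypes @ sample
    scores = scores - scores.max()                 # numerical stability
    weights = np.exp(scores)
    return weights / weights.sum()

# Toy usage: 8 near-orthonormal prototypes in a 16-dim space.
protos = np.linalg.qr(np.random.default_rng(0).normal(size=(16, 8)))[0].T
penalty = prototype_orthogonality_penalty(protos)   # approximately zero
coords = project_to_prototypes(np.random.default_rng(1).normal(size=16), protos)
</code></pre>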
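<p>The LANE entry (arXiv:2407.02833) above aligns embedded user preferences with embedded candidate item titles via multi-head attention. The snippet below sketches a single-head, scaled dot-product version of that alignment to make the data flow concrete; the embedding shapes and the aggregation over preferences are illustrative assumptions, not the paper's architecture.</p>
<pre><code>import numpy as np

def align_preferences_to_items(preference_emb, item_emb):
    """Score candidate items against extracted user-preference embeddings
    with single-head scaled dot-product attention, a simplified stand-in
    for the multi-head attention alignment sketched in the abstract.

    preference_emb: (num_preferences, dim); item_emb: (num_items, dim).
    Returns one relevance score per candidate item.
    """
    dim = preference_emb.shape[1]
    logits = preference_emb @ item_emb.T / np.sqrt(dim)      # (prefs, items)
    logits = logits - logits.max(axis=1, keepdims=True)      # stability
    attn = np.exp(logits)
    attn = attn / attn.sum(axis=1, keepdims=True)
    # Average attention each item receives across the extracted preferences.
    return attn.mean(axis=0)

# Toy usage with random 32-dim embeddings for 3 preferences and 10 items.
rng = np.random.default_rng(0)
scores = align_preferences_to_items(rng.normal(size=(3, 32)),
                                    rng.normal(size=(10, 32)))
ranked = np.argsort(scores)[::-1]   # candidate items ranked by relevance
</code></pre>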
</div> </main> </body> </html>