Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 3,099 results for author: <span class="mathjax">Li, C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Li%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+C&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Li%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18499">arXiv:2411.18499</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18499">pdf</a>, <a href="https://arxiv.org/format/2411.18499">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GATE OpenING: A Comprehensive Benchmark for Judging Open-ended Interleaved Image-Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+P">Pengfei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiaopeng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiajun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chuanhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhaopan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Ziyao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yuqi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yefei He</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Lirui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shuo Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianhua Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yuxuan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun 
Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+W">Wenqi Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaipeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18499v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding and generation tasks. However, generating interleaved image-text content remains a challenge, which requires integrated multimodal understanding and generation abilities. While the progress in unified models offers new solutions, existing benchmarks are insufficient for evaluating these methods due to da&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18499v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18499v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18499v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding and generation tasks. However, generating interleaved image-text content remains a challenge, which requires integrated multimodal understanding and generation abilities. While the progress in unified models offers new solutions, existing benchmarks are insufficient for evaluating these methods due to data size and diversity limitations. To bridge this gap, we introduce GATE OpenING (OpenING), a comprehensive benchmark comprising 5,400 high-quality human-annotated instances across 56 real-world tasks. OpenING covers diverse daily scenarios such as travel guide, design, and brainstorming, offering a robust platform for challenging interleaved generation methods. In addition, we present IntJudge, a judge model for evaluating open-ended multimodal generation methods. Trained with a novel data pipeline, our IntJudge achieves an agreement rate of 82. 42% with human judgments, outperforming GPT-based evaluators by 11.34%. Extensive experiments on OpenING reveal that current interleaved generation methods still have substantial room for improvement. Key findings on interleaved image-text generation are further presented to guide the development of next-generation models. The OpenING is open-sourced at https://opening.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18499v1-abstract-full').style.display = 'none'; document.getElementById('2411.18499v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">53 pages, 19 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18271">arXiv:2411.18271</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18271">pdf</a>, <a href="https://arxiv.org/format/2411.18271">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Efficient Nonlinear Function Approximation in Analog Resistive Crossbars for Recurrent Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Junyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+R">Ruibin Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+M">Mingrui Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+Y">Yichuan Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+P+V">Pao-Sheng Vincent Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+S">Shuai Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Pedretti%2C+G">Giacomo Pedretti</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+X">Xia Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Ignowski%2C+J">Jim Ignowski</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Can Li</a>, <a href="/search/cs?searchtype=author&amp;query=Basu%2C+A">Arindam Basu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18271v1-abstract-short" style="display: inline;"> Analog In-memory Computing (IMC) has demonstrated energy-efficient and low latency implementation of convolution and fully-connected layers in deep neural networks (DNN) by using physics for computing in parallel resistive memory arrays. However, recurrent neural networks (RNN) that are widely used for speech-recognition and natural language processing have tasted limited success with this approac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18271v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18271v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18271v1-abstract-full" style="display: none;"> Analog In-memory Computing (IMC) has demonstrated energy-efficient and low latency implementation of convolution and fully-connected layers in deep neural networks (DNN) by using physics for computing in parallel resistive memory arrays. However, recurrent neural networks (RNN) that are widely used for speech-recognition and natural language processing have tasted limited success with this approach. This can be attributed to the significant time and energy penalties incurred in implementing nonlinear activation functions that are abundant in such models. In this work, we experimentally demonstrate the implementation of a non-linear activation function integrated with a ramp analog-to-digital conversion (ADC) at the periphery of the memory to improve in-memory implementation of RNNs. 
Our approach uses an extra column of memristors to produce an appropriately pre-distorted ramp voltage such that the comparator output directly approximates the desired nonlinear function. We experimentally demonstrate programming different nonlinear functions using a memristive array and simulate its incorporation in RNNs to solve keyword spotting and language modelling tasks. Compared to other approaches, we demonstrate manifold increase in area-efficiency, energy-efficiency and throughput due to the in-memory, programmable ramp generator that removes digital processing overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18271v1-abstract-full').style.display = 'none'; document.getElementById('2411.18271v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18266">arXiv:2411.18266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18266">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Wearable intelligent throat enables natural speech in stroke patients with dysarthria </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chenyu Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+S">Shuo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+W">Wentian Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yuxuan Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+X">Xiaoxue Zhai</a>, <a href="/search/cs?searchtype=author&amp;query=Lei%2C+S">Sixuan Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Hongbei Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zibo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Muzi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shengbo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuhang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenxi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hongyun Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Ningli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+X">Xiaodong Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Smielewski%2C+P">Peter Smielewski</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yu Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+W">Wenhui Song</a>, <a 
href="/search/cs?searchtype=author&amp;query=Birchall%2C+M">Martin Birchall</a>, <a href="/search/cs?searchtype=author&amp;query=Occhipint%2C+L+G">Luigi G. Occhipint</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18266v1-abstract-short" style="display: inline;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to ena&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18266v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18266v1-abstract-full" style="display: none;"> Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to enable fluent, emotionally expressive communication. The system utilizes ultrasensitive textile strain sensors to capture high-quality signals from the neck area and supports token-level processing for real-time, continuous speech decoding, enabling seamless, delay-free communication. In tests with five stroke patients with dysarthria, IT&#39;s LLM agents intelligently corrected token errors and enriched sentence-level emotional and logical coherence, achieving low error rates (4.2% word error rate, 2.9% sentence error rate) and a 55% increase in user satisfaction. This work establishes a portable, intuitive communication platform for patients with dysarthria with the potential to be applied broadly across different neurological conditions and in multi-language support systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18266v1-abstract-full').style.display = 'none'; document.getElementById('2411.18266v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 figures, 45 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17532">arXiv:2411.17532</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17532">pdf</a>, <a href="https://arxiv.org/format/2411.17532">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FTMoMamba: Motion Generation with Frequency and Text State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengjian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+X">Xiangbo Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Q">Qiongjie Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yazhou Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17532v1-abstract-short" style="display: inline;"> Diffusion models achieve impressive performance in human motion generation. However, current approaches typically ignore the significance of frequency-domain information in capturing fine-grained motions within the latent space (e.g., low frequencies correlate with static poses, and high frequencies align with fine-grained motions). Additionally, there is a semantic discrepancy between text and mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17532v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17532v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17532v1-abstract-full" style="display: none;"> Diffusion models achieve impressive performance in human motion generation. However, current approaches typically ignore the significance of frequency-domain information in capturing fine-grained motions within the latent space (e.g., low frequencies correlate with static poses, and high frequencies align with fine-grained motions). Additionally, there is a semantic discrepancy between text and motion, leading to inconsistency between the generated motions and the text descriptions. In this work, we propose a novel diffusion-based FTMoMamba framework equipped with a Frequency State Space Model (FreqSSM) and a Text State Space Model (TextSSM). Specifically, to learn fine-grained representation, FreqSSM decomposes sequences into low-frequency and high-frequency components, guiding the generation of static pose (e.g., sits, lay) and fine-grained motions (e.g., transition, stumble), respectively. To ensure the consistency between text and motion, TextSSM encodes text features at the sentence level, aligning textual semantics with sequential features. Extensive experiments show that FTMoMamba achieves superior performance on the text-to-motion generation task, especially gaining the lowest FID of 0.181 (rather lower than 0.421 of MLD) on the HumanML3D dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17532v1-abstract-full').style.display = 'none'; document.getElementById('2411.17532v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17431">arXiv:2411.17431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17431">pdf</a>, <a href="https://arxiv.org/format/2411.17431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Noise Adaptor: Enhancing Low-Latency Spiking Neural Networks through Noise-Injected Low-Bit ANN Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Rajendran%2C+B">Bipin. Rajendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17431v1-abstract-short" style="display: inline;"> We present Noise Adaptor, a novel method for constructing competitive low-latency spiking neural networks (SNNs) by converting noise-injected, low-bit artificial neural networks (ANNs). This approach builds on existing ANN-to-SNN conversion techniques but offers several key improvements: (1) By injecting noise during quantized ANN training, Noise Adaptor better accounts for the dynamic differences&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17431v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17431v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17431v1-abstract-full" style="display: none;"> We present Noise Adaptor, a novel method for constructing competitive low-latency spiking neural networks (SNNs) by converting noise-injected, low-bit artificial neural networks (ANNs). This approach builds on existing ANN-to-SNN conversion techniques but offers several key improvements: (1) By injecting noise during quantized ANN training, Noise Adaptor better accounts for the dynamic differences between ANNs and SNNs, significantly enhancing SNN accuracy. (2) Unlike previous methods, Noise Adaptor does not require the application of run-time noise correction techniques in SNNs, thereby avoiding modifications to the spiking neuron model and control flow during inference. (3) Our method extends the capability of handling deeper architectures, achieving successful conversions of activation-quantized ResNet-101 and ResNet-152 to SNNs. We demonstrate the effectiveness of our method on CIFAR-10 and ImageNet, achieving competitive performance. The code will be made available as open-source. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17431v1-abstract-full').style.display = 'none'; document.getElementById('2411.17431v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17385">arXiv:2411.17385</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17385">pdf</a>, <a href="https://arxiv.org/format/2411.17385">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DepthCues: Evaluating Monocular Depth Perception in Large Vision Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Danier%2C+D">Duolikun Danier</a>, <a href="/search/cs?searchtype=author&amp;query=Ayg%C3%BCn%2C+M">Mehmet Ayg眉n</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changjian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bilen%2C+H">Hakan Bilen</a>, <a href="/search/cs?searchtype=author&amp;query=Mac+Aodha%2C+O">Oisin Mac Aodha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17385v1-abstract-short" style="display: inline;"> Large-scale pre-trained vision models are becoming increasingly prevalent, offering expressive and generalizable visual representations that benefit various downstream tasks. Recent studies on the emergent properties of these models have revealed their high-level geometric understanding, in particular in the context of depth perception. However, it remains unclear how depth perception arises in th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17385v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17385v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17385v1-abstract-full" style="display: none;"> Large-scale pre-trained vision models are becoming increasingly prevalent, offering expressive and generalizable visual representations that benefit various downstream tasks. Recent studies on the emergent properties of these models have revealed their high-level geometric understanding, in particular in the context of depth perception. However, it remains unclear how depth perception arises in these models without explicit depth supervision provided during pre-training. To investigate this, we examine whether the monocular depth cues, similar to those used by the human visual system, emerge in these models. We introduce a new benchmark, DepthCues, designed to evaluate depth cue understanding, and present findings across 20 diverse and representative pre-trained vision models. Our analysis shows that human-like depth cues emerge in more recent larger models. We also explore enhancing depth perception in large vision models by fine-tuning on DepthCues, and find that even without dense depth supervision, this improves depth estimation. 
To support further research, our benchmark and evaluation code will be made publicly available for studying depth perception in vision models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17385v1-abstract-full').style.display = 'none'; document.getElementById('2411.17385v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: https://danier97.github.io/depthcues/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17367">arXiv:2411.17367</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17367">pdf</a>, <a href="https://arxiv.org/format/2411.17367">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient Deployment of Transformer Models in Analog In-Memory Computing Hardware </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lammie%2C+C">Corey Lammie</a>, <a href="/search/cs?searchtype=author&amp;query=Gallo%2C+M+L">Manuel Le Gallo</a>, <a href="/search/cs?searchtype=author&amp;query=Rajendran%2C+B">Bipin Rajendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17367v1-abstract-short" style="display: inline;"> Analog in-memory computing (AIMC) has emerged as a promising solution to overcome the von Neumann bottleneck, accelerating neural network computations and improving computational efficiency. While AIMC has demonstrated success with architectures such as CNNs, MLPs, and RNNs, deploying transformer-based models using AIMC presents unique challenges. Transformers are expected to handle diverse downst&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17367v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17367v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17367v1-abstract-full" style="display: none;"> Analog in-memory computing (AIMC) has emerged as a promising solution to overcome the von Neumann bottleneck, accelerating neural network computations and improving computational efficiency. While AIMC has demonstrated success with architectures such as CNNs, MLPs, and RNNs, deploying transformer-based models using AIMC presents unique challenges. Transformers are expected to handle diverse downstream tasks and adapt to new user data or instructions after deployment, which requires more flexible approaches to suit AIMC constraints. In this paper, we propose a novel method for deploying pre-trained transformer models onto AIMC hardware. 
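
The "lightweight, low-rank adapters" named in the abstract above follow the same basic pattern as other low-rank adaptation schemes: a frozen base projection plus a small trainable correction. The sketch below is a generic example of that pattern, not the code of arXiv:2411.17367; here `base` stands in for a matrix-vector product that would run on analog tiles, while the rank-`r` factors A and B are the digital, per-task parameters assumed for this illustration.

    # Generic low-rank adapter around a frozen linear layer (illustrative only).
    import torch
    import torch.nn as nn

    class LowRankAdapterLinear(nn.Module):
        def __init__(self, base: nn.Linear, rank: int = 8):
            super().__init__()
            self.base = base
            for p in self.base.parameters():          # "analog" weights stay fixed
                p.requires_grad = False
            self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, rank))  # starts as a no-op

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.base(x) + x @ self.A.T @ self.B.T

Swapping in a different (A, B) pair per task is what allows a single frozen analog model to be reused across multiple tasks, as the abstract describes.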
Unlike traditional approaches requiring hardware-aware training, our technique allows direct deployment without the need for retraining the original model. Instead, we utilize lightweight, low-rank adapters -- compact modules stored in digital cores -- to adapt the model to hardware constraints. We validate our approach on MobileBERT, demonstrating accuracy on par with, or even exceeding, a traditional hardware-aware training approach. Our method is particularly appealing in multi-task scenarios, as it enables a single analog model to be reused across multiple tasks. Moreover, it supports on-chip adaptation to new hardware constraints and tasks without updating analog weights, providing a flexible and versatile solution for real-world AI applications. Code is available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17367v1-abstract-full').style.display = 'none'; document.getElementById('2411.17367v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17088">arXiv:2411.17088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17088">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 惟SFormer: Dual-Modal 惟-like Super-Resolution Transformer Network for Cross-scale and High-accuracy Terraced Field Vectorization Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Ce Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17088v1-abstract-short" style="display: inline;"> Terraced field is a significant engineering practice for soil and water conservation (SWC). Terraced field extraction from remotely sensed imagery is the foundation for monitoring and evaluating SWC. This study is the first to propose a novel dual-modal 惟-like super-resolution Transformer network for intelligent TFVE, offering the following advantages: (1) reducing edge segmentation error from con&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17088v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17088v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17088v1-abstract-full" style="display: none;"> Terraced field is a significant engineering practice for soil and water conservation (SWC). Terraced field extraction from remotely sensed imagery is the foundation for monitoring and evaluating SWC. 
This study is the first to propose a novel dual-modal 惟-like super-resolution Transformer network for intelligent TFVE, offering the following advantages: (1) reducing edge segmentation error from conventional multi-scale downsampling encoder, through fusing original high-resolution features with downsampling features at each step of encoder and leveraging a multi-head attention mechanism; (2) improving the accuracy of TFVE by proposing a 惟-like network structure, which fully integrates rich high-level features from both spectral and terrain data to form cross-scale super-resolution features; (3) validating an optimal fusion scheme for cross-modal and cross-scale (i.e., inconsistent spatial resolution between remotely sensed imagery and DEM) super-resolution feature extraction; (4) mitigating uncertainty between segmentation edge pixels by a coarse-to-fine and spatial topological semantic relationship optimization (STSRO) segmentation strategy; (5) leveraging contour vibration neural network to continuously optimize parameters and iteratively vectorize terraced fields from semantic segmentation results. Moreover, a DMRVD for deep-learning-based TFVE was created for the first time, which covers nine study areas in four provinces of China, with a total coverage area of 22441 square kilometers. To assess the performance of 惟SFormer, classic and SOTA networks were compared. The mIOU of 惟SFormer has improved by 0.165, 0.297 and 0.128 respectively, when compared with best accuracy single-modal remotely sensed imagery, single-modal DEM and dual-modal result. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17088v1-abstract-full').style.display = 'none'; document.getElementById('2411.17088v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16748">arXiv:2411.16748</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16748">pdf</a>, <a href="https://arxiv.org/format/2411.16748">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LetsTalk: Latent Diffusion Transformer for Talking Video Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haojie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Z">Zhihao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+R">Ruibo Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Z">Zhengqi Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xuefei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenxing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yaling Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16748v1-abstract-short" style="display: inline;"> Portrait image animation using audio has rapidly advanced, enabling the creation of increasingly realistic and expressive animated faces. The challenges of this multimodality-guided video generation task involve fusing various modalities while ensuring consistency in timing and portrait. We further seek to produce vivid talking heads. To address these challenges, we present LetsTalk (LatEnt Diffus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16748v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16748v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16748v1-abstract-full" style="display: none;"> Portrait image animation using audio has rapidly advanced, enabling the creation of increasingly realistic and expressive animated faces. The challenges of this multimodality-guided video generation task involve fusing various modalities while ensuring consistency in timing and portrait. We further seek to produce vivid talking heads. To address these challenges, we present LetsTalk (LatEnt Diffusion TranSformer for Talking Video Synthesis), a diffusion transformer that incorporates modular temporal and spatial attention mechanisms to merge multimodality and enhance spatial-temporal consistency. To handle multimodal conditions, we first summarize three fusion schemes, ranging from shallow to deep fusion compactness, and thoroughly explore their impact and applicability. Then we propose a suitable solution according to the modality differences of image, audio, and video generation. For portrait, we utilize a deep fusion scheme (Symbiotic Fusion) to ensure portrait consistency. For audio, we implement a shallow fusion scheme (Direct Fusion) to achieve audio-animation alignment while preserving diversity. Our extensive experiments demonstrate that our approach generates temporally coherent and realistic videos with enhanced diversity and liveliness. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16748v1-abstract-full').style.display = 'none'; document.getElementById('2411.16748v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16525">arXiv:2411.16525</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16525">pdf</a>, <a href="https://arxiv.org/format/2411.16525">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Fundamental Limits of Prompt Tuning Transformers: Universality, Capacity and Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J+Y">Jerry Yao-Chieh Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wei-Po Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Gilani%2C+A">Ammar Gilani</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chenyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Han Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16525v1-abstract-short" style="display: inline;"> We investigate the statistical and computational limits of prompt tuning for transformer-based foundation models. Our key contributions are prompt tuning on \textit{single-head} transformers with only a \textit{single} self-attention layer: (i) is universal, and (ii) supports efficient (even almost-linear time) algorithms under the Strong Exponential Time Hypothesis (SETH). Statistically, we prove&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16525v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16525v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16525v1-abstract-full" style="display: none;"> We investigate the statistical and computational limits of prompt tuning for transformer-based foundation models. Our key contributions are prompt tuning on \textit{single-head} transformers with only a \textit{single} self-attention layer: (i) is universal, and (ii) supports efficient (even almost-linear time) algorithms under the Strong Exponential Time Hypothesis (SETH). 
Statistically, we prove that prompt tuning on such simplest possible transformers are universal approximators for sequence-to-sequence Lipschitz functions. In addition, we provide an exponential-in-$dL$ and -in-$(1/蔚)$ lower bound on the required soft-prompt tokens for prompt tuning to memorize any dataset with 1-layer, 1-head transformers. Computationally, we identify a phase transition in the efficiency of prompt tuning, determined by the norm of the \textit{soft-prompt-induced} keys and queries, and provide an upper bound criterion. Beyond this criterion, no sub-quadratic (efficient) algorithm for prompt tuning exists under SETH. Within this criterion, we showcase our theory by proving the existence of almost-linear time prompt tuning inference algorithms. These fundamental limits provide important necessary conditions for designing expressive and efficient prompt tuning methods for practitioners. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16525v1-abstract-full').style.display = 'none'; document.getElementById('2411.16525v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16503">arXiv:2411.16503</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16503">pdf</a>, <a href="https://arxiv.org/format/2411.16503">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Noise Diffusion for Enhancing Semantic Faithfulness in Text-to-Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Miao%2C+B">Boming Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chunxiao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoxiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+A">Andi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Rui Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zizhe Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yao Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16503v1-abstract-short" style="display: inline;"> Diffusion models have achieved impressive success in generating photorealistic images, but challenges remain in ensuring precise semantic alignment with input prompts. Optimizing the initial noisy latent offers a more efficient alternative to modifying model architectures or prompt engineering for improving semantic alignment. 
A latest approach, InitNo, refines the initial noisy latent by leveragi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16503v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16503v1-abstract-full" style="display: none;"> Diffusion models have achieved impressive success in generating photorealistic images, but challenges remain in ensuring precise semantic alignment with input prompts. Optimizing the initial noisy latent offers a more efficient alternative to modifying model architectures or prompt engineering for improving semantic alignment. A latest approach, InitNo, refines the initial noisy latent by leveraging attention maps; however, these maps capture only limited information, and the effectiveness of InitNo is highly dependent on the initial starting point, as it tends to converge on a local optimum near this point. To this end, this paper proposes leveraging the language comprehension capabilities of large vision-language models (LVLMs) to guide the optimization of the initial noisy latent, and introduces the Noise Diffusion process, which updates the noisy latent to generate semantically faithful images while preserving distribution consistency. Furthermore, we provide a theoretical analysis of the condition under which the update improves semantic faithfulness. Experimental results demonstrate the effectiveness and adaptability of our framework, consistently enhancing semantic alignment across various diffusion models. The code is available at https://github.com/Bomingmiao/NoiseDiffusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16503v1-abstract-full').style.display = 'none'; document.getElementById('2411.16503v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16446">arXiv:2411.16446</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16446">pdf</a>, <a href="https://arxiv.org/format/2411.16446">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> VQ-SGen: A Vector Quantized Stroke Representation for Sketch Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiawei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Z">Zhiming Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changjian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16446v1-abstract-short" style="display: inline;"> This paper presents VQ-SGen, a novel algorithm for high-quality sketch generation. 
arXiv:2411.16446 (https://arxiv.org/abs/2411.16446) [cs.CV, cs.GR]
VQ-SGen: A Vector Quantized Stroke Representation for Sketch Generation
Authors: Jiawei Wang, Zhiming Cui, Changjian Li
Abstract: This paper presents VQ-SGen, a novel algorithm for high-quality sketch generation. Recent approaches have often framed the task as pixel-based generation either as a whole or part-by-part, neglecting the intrinsic and contextual relationships among individual strokes, such as the shape and spatial positioning of both proximal and distant strokes. To overcome these limitations, we propose treating each stroke within a sketch as an entity and introducing a vector-quantized (VQ) stroke representation for fine-grained sketch generation. Our method follows a two-stage framework: in the first stage, we decouple each stroke's shape and location information to ensure the VQ representation prioritizes stroke shape learning. In the second stage, we feed the precise and compact representation into an auto-decoding Transformer to incorporate stroke semantics, positions, and shapes into the generation process. By utilizing tokenized stroke representation, our approach generates strokes with high fidelity and facilitates novel applications, such as conditional generation and semantic-aware stroke editing. Comprehensive experiments demonstrate our method surpasses existing state-of-the-art techniques, underscoring its effectiveness. The code and model will be made publicly available upon publication.
Submitted 25 November, 2024; originally announced November 2024.
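The building block this abstract relies on, vector quantization, maps each continuous stroke embedding to its nearest entry in a learned codebook so that a stroke becomes a discrete token. The minimal numpy sketch below shows only that lookup step; the codebook values, dimensions, and the losses used to train such a representation are omitted, and all numbers are arbitrary toy data.

```python
import numpy as np

def vector_quantize(embeddings, codebook):
    """Map each row of `embeddings` to the index and value of its nearest codebook entry.

    embeddings: (num_strokes, dim) continuous stroke features
    codebook:   (codebook_size, dim) learned code vectors
    """
    # squared Euclidean distances between every embedding and every code
    dists = ((embeddings[:, None, :] - codebook[None, :, :]) ** 2).sum(axis=-1)
    tokens = dists.argmin(axis=1)            # one discrete token id per stroke
    quantized = codebook[tokens]             # the vectors actually fed to the decoder
    return tokens, quantized

rng = np.random.default_rng(0)
codebook = rng.standard_normal((8, 4))       # toy codebook: 8 codes of dimension 4
strokes = rng.standard_normal((5, 4))        # toy stroke-shape embeddings
tokens, quantized = vector_quantize(strokes, codebook)
print(tokens, quantized.shape)
```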
arXiv:2411.16253 (https://arxiv.org/abs/2411.16253) [cs.CV]
Open-Vocabulary Octree-Graph for 3D Scene Understanding
Authors: Zhigang Wang, Yifei Su, Chenhui Li, Dong Wang, Yan Huang, Bin Zhao, Xuelong Li
Abstract: Open-vocabulary 3D scene understanding is indispensable for embodied agents. Recent works leverage pretrained vision-language models (VLMs) for object segmentation and project them to point clouds to build 3D maps. Despite progress, a point cloud is a set of unordered coordinates that requires substantial storage space and does not directly convey occupancy information or spatial relations, making existing methods inefficient for downstream tasks, e.g., path planning and complex text-based object retrieval. To address these issues, we propose Octree-Graph, a novel scene representation for open-vocabulary 3D scene understanding. Specifically, a Chronological Group-wise Segment Merging (CGSM) strategy and an Instance Feature Aggregation (IFA) algorithm are first designed to get 3D instances and corresponding semantic features. Subsequently, an adaptive-octree structure is developed that stores semantics and depicts the occupancy of an object adjustably according to its shape. Finally, the Octree-Graph is constructed where each adaptive-octree acts as a graph node, and edges describe the spatial relations among nodes. Extensive experiments on various tasks are conducted on several widely-used datasets, demonstrating the versatility and effectiveness of our method.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16253v1-abstract-full').style.display = 'none'; document.getElementById('2411.16253v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11pages,7figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16216">arXiv:2411.16216</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16216">pdf</a>, <a href="https://arxiv.org/format/2411.16216">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SMGDiff: Soccer Motion Generation using diffusion probabilistic models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hongdi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhenxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Gaozheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingya Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jingyi Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Z">Zhuo Su</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+L">Lan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16216v1-abstract-short" style="display: inline;"> Soccer is a globally renowned sport with significant applications in video games and VR/AR. However, generating realistic soccer motions remains challenging due to the intricate interactions between the human player and the ball. In this paper, we introduce SMGDiff, a novel two-stage framework for generating real-time and user-controllable soccer motions. Our key idea is to integrate real-time cha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16216v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16216v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16216v1-abstract-full" style="display: none;"> Soccer is a globally renowned sport with significant applications in video games and VR/AR. However, generating realistic soccer motions remains challenging due to the intricate interactions between the human player and the ball. In this paper, we introduce SMGDiff, a novel two-stage framework for generating real-time and user-controllable soccer motions. Our key idea is to integrate real-time character control with a powerful diffusion-based generative model, ensuring high-quality and diverse output motion. In the first stage, we instantly transform coarse user controls into diverse global trajectories of the character. 
arXiv:2411.16216 (https://arxiv.org/abs/2411.16216) [cs.CV]
SMGDiff: Soccer Motion Generation using diffusion probabilistic models
Authors: Hongdi Yang, Chengyang Li, Zhenxuan Wu, Gaozheng Li, Jingya Wang, Jingyi Yu, Zhuo Su, Lan Xu
Abstract: Soccer is a globally renowned sport with significant applications in video games and VR/AR. However, generating realistic soccer motions remains challenging due to the intricate interactions between the human player and the ball. In this paper, we introduce SMGDiff, a novel two-stage framework for generating real-time and user-controllable soccer motions. Our key idea is to integrate real-time character control with a powerful diffusion-based generative model, ensuring high-quality and diverse output motion. In the first stage, we instantly transform coarse user controls into diverse global trajectories of the character. In the second stage, we employ a transformer-based autoregressive diffusion model to generate soccer motions based on trajectory conditioning. We further incorporate a contact guidance module during inference to optimize the contact details for realistic ball-foot interactions. Moreover, we contribute a large-scale soccer motion dataset consisting of over 1.08 million frames of diverse soccer motions. Extensive experiments demonstrate that our SMGDiff significantly outperforms existing methods in terms of motion quality and condition alignment.
Submitted 25 November, 2024; originally announced November 2024.

arXiv:2411.16095 (https://arxiv.org/abs/2411.16095) [cs.LG]
LDACP: Long-Delayed Ad Conversions Prediction Model for Bidding Strategy
Authors: Peng Cui, Yiming Yang, Fusheng Jin, Siyuan Tang, Yunli Wang, Fukang Yang, Yalong Jia, Qingpeng Cai, Fei Pan, Changcheng Li, Peng Jiang
Abstract: In online advertising, once an ad campaign is deployed, the automated bidding system dynamically adjusts the bidding strategy to optimize Cost Per Action (CPA) based on the number of ad conversions. For ads with a long conversion delay, relying solely on the real-time tracked conversion number as a signal for bidding strategy can significantly overestimate the current CPA, leading to conservative bidding strategies. Therefore, it is crucial to predict the number of long-delayed conversions. Nonetheless, it is challenging to predict ad conversion numbers through traditional regression methods due to the wide range of ad conversion numbers. Previous regression works have addressed this challenge by transforming regression problems into bucket classification problems, achieving success in various scenarios. However, specific challenges arise when predicting the number of ad conversions: 1) The integer nature of ad conversion numbers exacerbates the discontinuity issue in one-hot hard labels; 2) The long-tail distribution of ad conversion numbers complicates tail data prediction. In this paper, we propose the Long-Delayed Ad Conversions Prediction model for bidding strategy (LDACP), which consists of two sub-modules. To alleviate the issue of discontinuity in one-hot hard labels, the Bucket Classification Module with label Smoothing method (BCMS) converts one-hot hard labels into non-normalized soft labels, then fits these soft labels by minimizing classification loss and regression loss. To address the challenge of predicting tail data, the Value Regression Module with Proxy labels (VRMP) uses the prediction bias of aggregated pCTCVR as proxy labels. Finally, a Mixture of Experts (MoE) structure integrates the predictions from BCMS and VRMP to obtain the final predicted ad conversion number.
Submitted 25 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15871">arXiv:2411.15871</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15871">pdf</a>, <a href="https://arxiv.org/format/2411.15871">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haiquan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+C">Chaoyi Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jia He</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+J">Jiaqi Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chengjie Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiaosong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15871v1-abstract-short" style="display: inline;"> The growth of Large Language Models (LLMs) has necessitated large-scale distributed training. Highly optimized frameworks, however, still suffer significant losses in Model FLOPS utilization (often below 50%) due to large communication volumes. Meanwhile, our comprehensive profiling shows that the computation- and communication-intensive operators overlap well. This paper introduces DHelix, a no&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15871v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15871v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15871v1-abstract-full" style="display: none;"> The growth of Large Language Models (LLMs) has necessitated large-scale distributed training. Highly optimized frameworks, however, still suffer significant losses in Model FLOPS utilization (often below 50%) due to large communication volumes. Meanwhile, our comprehensive profiling shows that the computation- and communication-intensive operators overlap well. This paper introduces DHelix, a novel micro-structure that dramatically improves the efficiency of LLM training inspired by the DNA structure. Central to DHelix&#39;s design is Strand Interleaving (SI), which views the continuous stream of training micro-batches through a GPU as two strands. DHelix juxtaposes the forward and backward passes of the two strands and performs a systematic optimization for an SI plan that co-schedules the operators from the opposite strands, enabled by operator-level overlap profiling results and a dynamic-programming based search algorithm. Meanwhile, DHelix enables the two strands to share model states and space for activation data, effectively accommodating two micro-batches with under 3% extra memory space. 
arXiv:2411.15871 (https://arxiv.org/abs/2411.15871) [cs.DC]
Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution
Authors: Haiquan Wang, Chaoyi Ruan, Jia He, Jiaqi Ruan, Chengjie Tang, Xiaosong Ma, Cheng Li
Abstract: The growth of Large Language Models (LLMs) has necessitated large-scale distributed training. Highly optimized frameworks, however, still suffer significant losses in Model FLOPS utilization (often below 50%) due to large communication volumes. Meanwhile, our comprehensive profiling shows that the computation- and communication-intensive operators overlap well. This paper introduces DHelix, a novel micro-structure, inspired by the DNA structure, that dramatically improves the efficiency of LLM training. Central to DHelix's design is Strand Interleaving (SI), which views the continuous stream of training micro-batches through a GPU as two strands. DHelix juxtaposes the forward and backward passes of the two strands and performs a systematic optimization for an SI plan that co-schedules the operators from the opposite strands, enabled by operator-level overlap profiling results and a dynamic-programming based search algorithm. Meanwhile, DHelix enables the two strands to share model states and space for activation data, effectively accommodating two micro-batches with under 3% extra memory space. DHelix seamlessly integrates with all forms of existing data/model parallelism, the most challenging being pipeline parallelism, thanks to its unique model folding design that results in a W-shaped pipeline. We evaluate DHelix training with the popular Llama and GPT dense models, plus the Phi Mixture-of-Experts (MoE) model, across 3 GPU clusters (A40, A800, and H100). Results show that it achieves 12-40% (up to 58% MFU) and 2-29% (up to 71% MFU) improvement on the 64-A40 and 64-A800 clusters, respectively, significantly outperforming state-of-the-art methods. On the H100 cluster, though the faster network reduces DHelix's profit margin, it makes cross-node tensor parallelism promising, a practice currently prohibitive due to communication costs.
Submitted 24 November, 2024; originally announced November 2024.
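The Strand Interleaving idea described above, pairing one strand's communication with the other strand's computation so the two can overlap, can be sketched at a toy level as follows. The operator lists, costs, and the simple pairing rule are invented for illustration; DHelix's actual planner uses profiled overlap data and a dynamic-programming search.

```python
# Toy illustration of co-scheduling two "strands" of micro-batch operators so that
# one strand's communication overlaps with the other strand's computation.

strand_a = [("compute", 4), ("comm", 3), ("compute", 2), ("comm", 2)]  # (kind, cost)
strand_b = [("comm", 3), ("compute", 4), ("comm", 2), ("compute", 2)]

def sequential_time(ops_a, ops_b):
    """Baseline: run the two strands back to back, nothing overlaps."""
    return sum(c for _, c in ops_a) + sum(c for _, c in ops_b)

def interleaved_time(ops_a, ops_b):
    """Pair up steps from the two strands; compute on one strand can hide
    communication on the other, so a mixed pair costs max() instead of sum()."""
    total = 0
    for (kind_a, cost_a), (kind_b, cost_b) in zip(ops_a, ops_b):
        if kind_a != kind_b:                 # compute overlaps with communication
            total += max(cost_a, cost_b)
        else:                                # same resource: must serialize
            total += cost_a + cost_b
    return total

print("sequential :", sequential_time(strand_a, strand_b))   # 22 in this toy example
print("interleaved:", interleaved_time(strand_a, strand_b))  # 12 in this toy example
```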
arXiv:2411.15770 (https://arxiv.org/abs/2411.15770) [cs.CV]
Text-Guided Coarse-to-Fine Fusion Network for Robust Remote Sensing Visual Question Answering
Authors: Zhicheng Zhao, Changfu Zhou, Yu Zhang, Chenglong Li, Xiaoliang Ma, Jin Tang
Abstract: Remote Sensing Visual Question Answering (RSVQA) has gained significant research interest. However, current RSVQA methods are limited by the imaging mechanisms of optical sensors, particularly under challenging conditions such as cloud-covered and low-light scenarios. Given the all-time and all-weather imaging capabilities of Synthetic Aperture Radar (SAR), it is crucial to investigate the integration of optical-SAR images to improve RSVQA performance. In this work, we propose a Text-guided Coarse-to-Fine Fusion Network (TGFNet), which leverages the semantic relationships between question text and multi-source images to guide the network toward complementary fusion at the feature level. Specifically, we develop a Text-guided Coarse-to-Fine Attention Refinement (CFAR) module to focus on key areas related to the question in complex remote sensing images. This module progressively directs attention from broad areas to finer details through key region routing, enhancing the model's ability to focus on relevant regions. Furthermore, we propose an Adaptive Multi-Expert Fusion (AMEF) module that dynamically integrates different experts, enabling the adaptive fusion of optical and SAR features. In addition, we create the first large-scale benchmark dataset for evaluating optical-SAR RSVQA methods, comprising 6,008 well-aligned optical-SAR image pairs and 1,036,694 well-labeled question-answer pairs across 16 diverse question types, including complex relational reasoning questions. Extensive experiments on the proposed dataset demonstrate that our TGFNet effectively integrates complementary information between optical and SAR images, significantly improving the model's performance in challenging scenarios. The dataset is available at: https://github.com/mmic-lcl/. Index Terms: Remote Sensing Visual Question Answering, Multi-source Data Fusion, Multimodal, Remote Sensing, OPT-SAR.
Submitted 24 November, 2024; originally announced November 2024.
arXiv:2411.15576 (https://arxiv.org/abs/2411.15576) [eess.IV, cs.CV]
MulModSeg: Enhancing Unpaired Multi-Modal Medical Image Segmentation with Modality-Conditioned Text Embedding and Alternating Training
Authors: Chengyin Li, Hui Zhu, Rafi Ibn Sultan, Hassan Bagher Ebadian, Prashant Khanduri, Chetty Indrin, Kundan Thind, Dongxiao Zhu
Abstract: In the diverse field of medical imaging, automatic segmentation has numerous applications and must handle a wide variety of input domains, such as different types of Computed Tomography (CT) scans and Magnetic Resonance (MR) images. This heterogeneity challenges automatic segmentation algorithms to maintain consistent performance across different modalities due to the requirement for spatially aligned and paired images. Typically, segmentation models are trained using a single modality, which limits their ability to generalize to other types of input data without employing transfer learning techniques. Additionally, leveraging complementary information from different modalities to enhance segmentation precision often necessitates substantial modifications to popular encoder-decoder designs, such as introducing multiple branched encoding or decoding paths for each modality. In this work, we propose a simple Multi-Modal Segmentation (MulModSeg) strategy to enhance medical image segmentation across multiple modalities, specifically CT and MR.
It incorporates two key designs: a modality-conditioned text embedding framework via a frozen text encoder that adds modality awareness to existing segmentation frameworks without significant structural modifications or computational overhead, and an alternating training procedure that facilitates the integration of essential features from unpaired CT and MR inputs. Through extensive experiments with both Fully Convolutional Network and Transformer-based backbones, MulModSeg consistently outperforms previous methods in segmenting abdominal multi-organ and cardiac substructures for both CT and MR modalities. The code is available at https://github.com/ChengyinLee/MulModSeg_2024.
Submitted 23 November, 2024; originally announced November 2024.
Comments: Accepted by WACV-2025
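The alternating training procedure mentioned above can be sketched generically: draw a batch from the CT loader and a batch from the MR loader in turn, and condition a shared segmentation network on an embedding of the current modality. Everything below (toy model, fake batches, loss) is a placeholder skeleton showing only the shape of such a loop, not the MulModSeg implementation; in particular, the learnable `nn.Embedding` stands in for the frozen text-encoder embedding used by the paper.

```python
import torch
import torch.nn as nn

class ToySegNet(nn.Module):
    """Stand-in segmentation backbone conditioned on a per-modality embedding."""
    def __init__(self, channels=8, num_classes=3, num_modalities=2):
        super().__init__()
        self.modality_embed = nn.Embedding(num_modalities, channels)  # stand-in for text embedding
        self.body = nn.Conv3d(1, channels, kernel_size=3, padding=1)
        self.head = nn.Conv3d(channels, num_classes, kernel_size=1)

    def forward(self, x, modality_id):
        cond = self.modality_embed(modality_id).view(1, -1, 1, 1, 1)
        return self.head(torch.relu(self.body(x) + cond))

def fake_batch():
    # Random stand-in for an unpaired image/label pair of either modality.
    return torch.randn(1, 1, 8, 8, 8), torch.randint(0, 3, (1, 8, 8, 8))

model = ToySegNet()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Alternate between unpaired CT (id 0) and MR (id 1) batches at every step.
for step in range(4):
    modality = step % 2
    image, label = fake_batch()
    logits = model(image, torch.tensor(modality))
    loss = loss_fn(logits, label)
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(f"step {step}: modality={'CT' if modality == 0 else 'MR'} loss={loss.item():.3f}")
```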
arXiv:2411.14951 (https://arxiv.org/abs/2411.14951) [cs.CV]
Morph: A Motion-free Physics Optimization Framework for Human Motion Generation
Authors: Zhuo Li, Mingshuang Luo, Ruibing Hou, Xin Zhao, Hao Liu, Hong Chang, Zimo Liu, Chen Li
Abstract: Human motion generation plays a vital role in applications such as digital humans and humanoid robot control. However, most existing approaches disregard physics constraints, leading to the frequent production of physically implausible motions with pronounced artifacts such as floating and foot sliding. In this paper, we propose Morph, a Motion-free physics optimization framework, comprising a Motion Generator and a Motion Physics Refinement module, for enhancing physical plausibility without relying on costly real-world motion data. Specifically, the Motion Generator is responsible for providing large-scale synthetic motion data, while the Motion Physics Refinement Module utilizes these synthetic data to train a motion imitator within a physics simulator, enforcing physical constraints to project the noisy motions into a physically-plausible space. These physically refined motions, in turn, are used to fine-tune the Motion Generator, further enhancing its capability. Experiments on both text-to-motion and music-to-dance generation tasks demonstrate that our framework achieves state-of-the-art motion generation quality while improving physical plausibility drastically.
Submitted 22 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14786">arXiv:2411.14786</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14786">pdf</a>, <a href="https://arxiv.org/format/2411.14786">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FastGrasp: Efficient Grasp Synthesis with Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiaofei Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Caoji Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuexin Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yujiao Shi</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xuming He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14786v1-abstract-short" style="display: inline;"> Effectively modeling the interaction between human hands and objects is challenging due to the complex physical constraints and the requirement for high generation efficiency in applications. Prior approaches often employ computationally intensive two-stage approaches, which first generate an intermediate representation, such as contact maps, followed by an iterative optimization procedure that up&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14786v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14786v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14786v1-abstract-full" style="display: none;"> Effectively modeling the interaction between human hands and objects is challenging due to the complex physical constraints and the requirement for high generation efficiency in applications. Prior approaches often employ computationally intensive two-stage approaches, which first generate an intermediate representation, such as contact maps, followed by an iterative optimization procedure that updates hand meshes to capture the hand-object relation. However, due to the high computation complexity during the optimization stage, such strategies often suffer from low efficiency in inference. To address this limitation, this work introduces a novel diffusion-model-based approach that generates the grasping pose in a one-stage manner. This allows us to significantly improve generation speed and the diversity of generated hand poses. In particular, we develop a Latent Diffusion Model with an Adaptation Module for object-conditioned hand pose generation and a contact-aware loss to enforce the physical constraints between hands and objects. Extensive experiments demonstrate that our method achieves faster inference, higher diversity, and superior pose quality than state-of-the-art approaches. 
Code is available at https://github.com/wuxiaofei01/FastGrasp.
Submitted 22 November, 2024; originally announced November 2024.

arXiv:2411.14158 (https://arxiv.org/abs/2411.14158) [cs.CV]
Point Cloud Denoising With Fine-Granularity Dynamic Graph Convolutional Networks
Authors: Wenqiang Xu, Wenrui Dai, Duoduo Xue, Ziyang Zheng, Chenglin Li, Junni Zou, Hongkai Xiong
Abstract: Due to limitations in acquisition equipment, noise perturbations often corrupt 3-D point clouds, hindering down-stream tasks such as surface reconstruction, rendering, and further processing. Existing 3-D point cloud denoising methods typically fail to reliably fit the underlying continuous surface, resulting in a degradation of reconstruction performance. This paper introduces fine-granularity dynamic graph convolutional networks called GD-GCN, a novel approach to denoising in 3-D point clouds. The GD-GCN employs micro-step temporal graph convolution (MST-GConv) to perform feature learning in a gradual manner.
Compared with the conventional GCN, which commonly uses discrete integer-step graph convolution, this modification introduces a more adaptable and nuanced approach to feature learning within graph convolution networks. It more accurately depicts the process of fitting the noisy point cloud to the underlying surface: the learning process of MST-GConv is treated as a continuously evolving system governed by neural Partial Differential Equations (PDEs), allowing it to adapt and improve over time. GD-GCN approximates the Riemannian metric, calculating distances between points along a low-dimensional manifold. This capability allows it to understand the local geometric structure and effectively capture diverse relationships between points from different geometric regions through geometric graph construction based on Riemannian distances. Additionally, GD-GCN incorporates robust graph spectral filters based on the Bernstein polynomial approximation, which modulate eigenvalues for complex and arbitrary spectral responses, providing theoretical guarantees for BIBO stability. Symmetric channel mixing matrices further enhance filter flexibility by enabling channel-level scaling and shifting in the spectral domain.
Submitted 21 November, 2024; originally announced November 2024.

arXiv:2411.14120 (https://arxiv.org/abs/2411.14120) [cs.CV]
Point Cloud Resampling with Learnable Heat Diffusion
Authors: Wenqiang Xu, Wenrui Dai, Duoduo Xue, Ziyang Zheng, Chenglin Li, Junni Zou, Hongkai Xiong
Abstract: Generative diffusion models have shown empirical successes in point cloud resampling, generating a denser and more uniform distribution of points from sparse or noisy 3D point clouds by progressively refining noise into structure. However, existing diffusion models employ manually predefined schemes, which often fail to recover the underlying point cloud structure due to the rigid and disruptive nature of the geometric degradation. To address this issue, we propose a novel learnable heat diffusion framework for point cloud resampling, which directly parameterizes the marginal distribution for the forward process by learning the adaptive heat diffusion schedules and local filtering scales of the time-varying heat kernel, and consequently, generates an adaptive conditional prior for the reverse process. Unlike previous diffusion models with a fixed prior, the adaptive conditional prior selectively preserves geometric features of the point cloud by minimizing a refined variational lower bound, guiding the points to evolve towards the underlying surface during the reverse process. Extensive experimental results demonstrate that the proposed point cloud resampling achieves state-of-the-art performance in representative reconstruction tasks including point cloud denoising and upsampling.
Submitted 21 November, 2024; originally announced November 2024.
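To give a feel for what heat diffusion on a point cloud means in the abstract above, the sketch below runs a few explicit Euler steps of the heat equation on a k-nearest-neighbour graph Laplacian, with the diffusion time `t` playing the role of the schedule the paper learns adaptively. Only this basic smoothing ingredient is shown; the learnable schedules, local filtering scales, and the reverse generative process are not reproduced, and the graph construction and step size are assumptions for the example.

```python
import numpy as np

def knn_graph_laplacian(points, k=6):
    """Row-normalized graph Laplacian L = I - D^{-1} W on a kNN graph."""
    d2 = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
    np.fill_diagonal(d2, np.inf)                 # exclude self-loops
    nbrs = np.argsort(d2, axis=1)[:, :k]
    W = np.zeros_like(d2)
    rows = np.repeat(np.arange(len(points)), k)
    W[rows, nbrs.ravel()] = 1.0
    W = np.maximum(W, W.T)                       # symmetrize the adjacency
    D_inv = 1.0 / W.sum(axis=1, keepdims=True)
    return np.eye(len(points)) - D_inv * W

def heat_diffuse(points, t=0.5, steps=10, k=6):
    """Approximate x <- exp(-t L) x with `steps` explicit Euler steps."""
    L = knn_graph_laplacian(points, k)
    x = points.copy()
    dt = t / steps
    for _ in range(steps):
        x = x - dt * (L @ x)                     # each step smooths x along the graph
    return x

rng = np.random.default_rng(0)
clean = rng.uniform(-1, 1, size=(200, 3))
noisy = clean + 0.05 * rng.standard_normal(clean.shape)
smoothed = heat_diffuse(noisy, t=0.5)            # larger t means stronger smoothing
print(float(np.abs(smoothed - noisy).mean()))
```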
arXiv:2411.13162 (https://arxiv.org/abs/2411.13162) [cs.GT]
IC Mechanisms for Risk-Averse Advertisers in the Online Advertising System
Authors: Bingzhe Wang, Ruohan Qian, Yuejia Dou, Qi Qi, Bo Shen, Changyuan Li, Yixuan Zhang, Yixin Su, Xin Yuan, Wenqiang liu, Bin Zou, Wen Yi, Zhi Guo, Shuanglong Li, Liu Lin
Abstract: The autobidding system generates huge revenue for advertising platforms, garnering substantial research attention. Existing studies in autobidding systems focus on designing Autobidding Incentive Compatible (AIC) mechanisms, where the mechanism is Incentive Compatible (IC) under ex ante expectations. However, upon deploying AIC mechanisms in advertising platforms, we observe a notable deviation between the actual auction outcomes and these expectations during runtime, particularly in the scene with few clicks (sparse-click). This discrepancy undermines truthful bidding among advertisers in AIC mechanisms, especially for risk-averse advertisers who are averse to outcomes that do not align with the expectations.
To address this issue, we propose a mechanism, Decoupled First-Price Auction (DFP), that retains its IC property even during runtime. DFP dynamically adjusts the payment based on real-time user conversion outcomes, ensuring that advertisers' realized utilities closely approximate their expected utilities during runtime. To realize the payment mechanism of DFP, we propose a PPO-based RL algorithm with a meticulously crafted reward function. This algorithm dynamically adjusts the payment to fit the DFP mechanism. We conduct extensive experiments leveraging real-world data to validate our findings.
Submitted 20 November, 2024; originally announced November 2024.

arXiv:2411.13144 (https://arxiv.org/abs/2411.13144) [cs.CR, cs.AI, cs.CV]
CopyrightMeter: Revisiting Copyright Protection in Text-to-image Models
Authors: Naen Xu, Changjiang Li, Tianyu Du, Minxi Li, Wenjie Luo, Jiacheng Liang, Yuyuan Li, Xuhong Zhang, Meng Han, Jianwei Yin, Ting Wang
Abstract: Text-to-image diffusion models have emerged as powerful tools for generating high-quality images from textual descriptions. However, their increasing popularity has raised significant copyright concerns, as these models can be misused to reproduce copyrighted content without authorization. In response, recent studies have proposed various copyright protection methods, including adversarial perturbation, concept erasure, and watermarking techniques. However, their effectiveness and robustness against advanced attacks remain largely unexplored. Moreover, the lack of unified evaluation frameworks has hindered systematic comparison and fair assessment of different approaches. To bridge this gap, we systematize existing copyright protection methods and attacks, providing a unified taxonomy of their design spaces. We then develop CopyrightMeter, a unified evaluation framework that incorporates 17 state-of-the-art protections and 16 representative attacks. Leveraging CopyrightMeter, we comprehensively evaluate protection methods across multiple dimensions, thereby uncovering how different design choices impact fidelity, efficacy, and resilience under attacks. Our analysis reveals several key findings: (i) most protections (16/17) are not resilient against attacks; (ii) the "best" protection varies depending on the target priority; (iii) more advanced attacks significantly promote the upgrading of protections. These insights provide concrete guidance for developing more robust protection methods, while its unified evaluation protocol establishes a standard benchmark for future copyright protection research in text-to-image generation.
Submitted 20 November, 2024; originally announced November 2024.
arXiv:2411.13037 (https://arxiv.org/abs/2411.13037) [quant-ph, cs.ET]
Machine Learning for Arbitrary Single-Qubit Rotations on an Embedded Device
Authors: Madhav Narayan Bhat, Marco Russo, Luca P. Carloni, Giuseppe Di Guglielmo, Farah Fahim, Andy C. Y. Li, Gabriel N. Perdue
Abstract: Here we present a technique for using machine learning (ML) for single-qubit gate synthesis on field programmable logic for a superconducting transmon-based quantum computer based on simulated studies. Our approach is multi-stage. We first bootstrap a model based on simulation with access to the full statevector for measuring gate fidelity. We next present an algorithm, named adapted randomized benchmarking (ARB), for fine-tuning the gate on hardware based on measurements of the devices. We also present techniques for deploying the model on programmable devices with care to reduce the required resources. While the techniques here are applied to a transmon-based computer, many of them are portable to other architectures.
Submitted 19 November, 2024; originally announced November 2024.

arXiv:2411.12762 (https://arxiv.org/abs/2411.12762) [cs.CL, cs.AI]
Title: Playing Language Game with LLMs Leads to Jailbreaking
Authors: Yu Peng, Zewen Long, Fangming Dong, Congyi Li, Shu Wu, Kai Chen
Abstract: The advent of large language models (LLMs) has spurred the development of numerous jailbreak techniques aimed at circumventing their security defenses against malicious attacks. An effective jailbreak approach is to identify a domain where safety generalization fails, a phenomenon known as mismatched generalization. In this paper, we introduce two novel jailbreak methods based on mismatched generalization: natural language games and custom language games, both of which effectively bypass the safety mechanisms of LLMs; their many kinds and variants make them hard to defend against and lead to high attack success rates. Natural language games involve the use of synthetic linguistic constructs and the actions intertwined with these constructs, such as the Ubbi Dubbi language. Building on this phenomenon, we propose the custom language games method: by engaging with LLMs using a variety of custom rules, we successfully execute jailbreak attacks across multiple LLM platforms. Extensive experiments demonstrate the effectiveness of our methods, achieving success rates of 93% on GPT-4o, 89% on GPT-4o-mini and 83% on Claude-3.5-Sonnet.
Furthermore, to investigate the generalizability of safety alignments, we fine-tuned Llama-3.1-70B with the custom language games to achieve safety alignment within our datasets and found that when interacting through other language games, the fine-tuned models still failed to identify harmful content. This finding indicates that the safety alignment knowledge embedded in LLMs fails to generalize across different linguistic formats, thus opening new avenues for future research in this area.
Submitted 27 November, 2024; v1 submitted 16 November, 2024; originally announced November 2024.
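For readers unfamiliar with the language game cited above, here is a toy Ubbi Dubbi encoder (insert "ub" before each vowel group). It only illustrates the kind of systematic text transformation such games perform; it is not the authors' attack code and carries no harmful content by itself.

```python
import re

def ubbi_dubbi(text):
    """Toy Ubbi Dubbi encoder: insert 'ub' before each group of vowels."""
    return re.sub(r"[aeiouAEIOU]+", lambda m: "ub" + m.group(0), text)

print(ubbi_dubbi("please tell me a story"))  # plubeasube tubell mube uba stubory
```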

arXiv:2411.12503 (https://arxiv.org/abs/2411.12503) [cs.RO]
Title: ManiSkill-ViTac 2025: Challenge on Manipulation Skill Learning With Vision and Tactile Sensing
Authors: Chuanyu Li, Renjun Dang, Xiang Li, Zhiyuan Wu, Jing Xu, Hamidreza Kasaei, Roberto Calandra, Nathan Lepora, Shan Luo, Hao Su, Rui Chen
Abstract: This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on learning contact-rich manipulation skills using both tactile and visual sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 independent tracks: tactile manipulation, tactile-vision fusion manipulation, and tactile sensor structure design. The challenge aims to push the boundaries of robotic manipulation skills, emphasizing the integration of tactile and visual data to enhance performance in complex, real-world tasks. Participants will be evaluated using standardized metrics across both simulated and real-world environments, spurring innovations in sensor design and significantly advancing the field of vision-tactile fusion in robotics.
Submitted 19 November, 2024; originally announced November 2024.
Comments: Challenge webpage: https://ai-workshops.github.io/maniskill-vitac-challenge-2025/

arXiv:2411.12156 (https://arxiv.org/abs/2411.12156) [cs.CL, cs.AI]
Title: HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning with Hard Negatives
Authors: Wenxiao Liu, Zihong Yang, Chaozhuo Li, Zijin Hong, Jianfeng Ma, Zhiquan Liu, Litian Zhang, Feiran Huang
Abstract: Unsupervised sentence representation learning remains a critical challenge in modern
natural language processing (NLP) research. Recently, contrastive learning techniques have achieved significant success in addressing this issue by effectively capturing textual semantics. Many such approaches prioritize the optimization using negative samples. In fields such as computer vision, hard negative samples (samples that are close to the decision boundary and thus more difficult to distinguish) have been shown to enhance representation learning. However, adapting hard negatives to contrastive sentence learning is complex due to the intricate syntactic and semantic details of text. To address this problem, we propose HNCSE, a novel contrastive learning framework that extends the leading SimCSE approach. The hallmark of HNCSE is its innovative use of hard negative samples to enhance the learning of both positive and negative samples, thereby achieving a deeper semantic understanding. Empirical tests on semantic textual similarity and transfer task datasets validate the superiority of HNCSE.
Submitted 18 November, 2024; originally announced November 2024.
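To make the role of hard negatives concrete, below is a minimal InfoNCE-style sketch in which mined hard negatives are appended to the in-batch negatives. It is a generic illustration of the technique, not HNCSE's actual objective; the tensor names and temperature are assumptions.

```python
import torch
import torch.nn.functional as F

def contrastive_loss_with_hard_negatives(anchor, positive, hard_neg, temperature=0.05):
    """InfoNCE-style loss: in-batch negatives plus explicitly mined hard negatives.

    anchor, positive, hard_neg: (batch, dim) sentence embeddings (hypothetical names).
    """
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    hard_neg = F.normalize(hard_neg, dim=-1)

    sim_pos = anchor @ positive.t() / temperature   # (batch, batch); diagonal = true pairs
    sim_hard = anchor @ hard_neg.t() / temperature  # (batch, batch); mined hard negatives

    logits = torch.cat([sim_pos, sim_hard], dim=1)  # (batch, 2 * batch)
    labels = torch.arange(anchor.size(0), device=anchor.device)
    return F.cross_entropy(logits, labels)

# Usage with random embeddings standing in for encoder outputs.
b, d = 8, 128
loss = contrastive_loss_with_hard_negatives(torch.randn(b, d), torch.randn(b, d), torch.randn(b, d))
print(float(loss))
```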

arXiv:2411.11934 (https://arxiv.org/abs/2411.11934) [cs.CV, cs.AI]
Title: SpatialDreamer: Self-supervised Stereo Video Synthesis from Monocular Input
Authors: Zhen Lv, Yangqi Long, Congzhentao Huang, Cao Li, Chengfei Lv, Hao Ren, Dian Zheng
Abstract: Stereo video synthesis from a monocular input is a demanding task in the fields of spatial computing and virtual reality. The main challenges of this task lie in the insufficiency of high-quality paired stereo videos for training and the difficulty of maintaining spatio-temporal consistency between frames. Existing methods primarily address these issues by directly applying novel view synthesis (NVS) techniques to video, while facing limitations such as the inability to effectively represent dynamic scenes and the requirement for large amounts of training data. In this paper, we introduce a novel self-supervised stereo video synthesis paradigm via a video diffusion model, termed SpatialDreamer, which meets the challenges head-on. Firstly, to address the stereo video data insufficiency, we propose a Depth-based Video Generation module (DVG), which employs a forward-backward rendering mechanism to generate paired videos with geometric and temporal priors. Leveraging data generated by DVG, we propose RefinerNet along with a self-supervised synthetic framework designed to facilitate efficient and dedicated training.
More importantly, we devise a consistency control module, which consists of a metric of stereo deviation strength and a Temporal Interaction Learning module (TIL), to ensure geometric and temporal consistency, respectively. We evaluated the proposed method against various benchmark methods, and the results showcase its superior performance.
Submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.11903 (https://arxiv.org/abs/2411.11903) [cs.CV]
Title: DiHuR: Diffusion-Guided Generalizable Human Reconstruction
Authors: Jinnan Chen, Chen Li, Gim Hee Lee
Abstract: We introduce DiHuR, a novel Diffusion-guided model for generalizable Human 3D Reconstruction and view synthesis from sparse, minimally overlapping images. While existing generalizable human radiance fields excel at novel view synthesis, they often struggle with comprehensive 3D reconstruction. Similarly, directly optimizing implicit Signed Distance Function (SDF) fields from sparse-view images typically yields poor results due to limited overlap. To enhance 3D reconstruction quality, we propose using learnable tokens associated with SMPL vertices to aggregate sparse view features and then to guide SDF prediction. These tokens learn a generalizable prior across different identities in training datasets, leveraging the consistent projection of SMPL vertices onto similar semantic areas across various human identities.
This consistency enables effective knowledge transfer to unseen identities during inference. Recognizing SMPL's limitations in capturing clothing details, we incorporate a diffusion model as an additional prior to fill in missing information, particularly for complex clothing geometries. Our method integrates two key priors in a coherent manner: the prior from generalizable feed-forward models and the 2D diffusion prior, and it requires only multi-view image training, without 3D supervision. DiHuR demonstrates superior performance in both within-dataset and cross-dataset generalization settings, as validated on THuman, ZJU-MoCap, and HuMMan datasets compared to existing methods.
Submitted 15 November, 2024; originally announced November 2024.
Comments: Accepted to WACV 2025

arXiv:2411.11623 (https://arxiv.org/abs/2411.11623) [cs.CL]
Title: Federated Incremental Named Entity Recognition
Authors: Duzhen Zhang, Yahan Yu, Chenxing Li, Jiahua Dong, Dong Yu
Abstract: Federated Named Entity Recognition (FNER) boosts model training within each local client by aggregating the model updates of decentralized local clients, without sharing their private data.
However, existing FNER methods assume fixed entity types and local clients in advance, leading to their ineffectiveness in practical applications. In a more realistic scenario, local clients receive new entity types continuously, while new local clients collecting novel data may irregularly join the global FNER training. This challenging setup, referred to here as Federated Incremental NER, leaves the global model suffering from heterogeneous forgetting of old entity types from both intra-client and inter-client perspectives. To overcome these challenges, we propose a Local-Global Forgetting Defense (LGFD) model. Specifically, to address intra-client forgetting, we develop a structural knowledge distillation loss to retain the latent space's feature structure and a pseudo-label-guided inter-type contrastive loss to enhance discriminative capability over different entity types, effectively preserving previously learned knowledge within local clients. To tackle inter-client forgetting, we propose a task switching monitor that can automatically identify new entity types under privacy protection and store the latest old global model for knowledge distillation and pseudo-labeling. Experiments demonstrate significant improvement of our LGFD model over comparison methods.
Submitted 18 November, 2024; originally announced November 2024.
Comments: Under Review
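For reference, the distillation component mentioned above builds on standard knowledge distillation; the sketch below shows only the generic KL-based distillation term (teacher = stored old global model, student = model being updated), not the paper's specific structural or contrastive losses. Names and the temperature value are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """Generic KD term: KL divergence between temperature-softened teacher and student outputs.

    In the incremental setting described above, the teacher would be the stored old
    global model and the student the model being updated on new entity types.
    """
    log_p_student = F.log_softmax(student_logits / temperature, dim=-1)
    p_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    # batchmean matches the per-sample mathematical definition of KL divergence.
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * temperature ** 2

# Toy usage: token-level logits over an entity-type vocabulary of size 9.
student = torch.randn(4, 9)
teacher = torch.randn(4, 9)
print(float(distillation_loss(student, teacher)))
```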
style="display: inline;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11435v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11435v1-abstract-full" style="display: none;"> Text logo design heavily relies on the creativity and expertise of professional designers, in which arranging element layouts is one of the most important procedures. However, few attention has been paid to this specific task which needs to take precise textural details and user constraints into consideration, but only on the broader tasks such as document/poster layout generation. In this paper, we propose a VLM-based framework that generates content-aware text logo layouts by integrating multi-modal inputs with user constraints, supporting a more flexible and stable layout design in real-world applications. We introduce two model techniques to reduce the computation for processing multiple glyph images simultaneously, while does not face performance degradation. To support instruction-tuning of out model, we construct two extensive text logo datasets, which are 5x more larger than the existing public dataset. Except for the geometric annotations (e.g. text masks and character recognition), we also compliment with comprehensive layout descriptions in natural language format, for more effective training to have reasoning ability when dealing with complex layouts and custom user constraints. Experimental studies demonstrate the effectiveness of our proposed model and datasets, when comparing with previous methods in various benchmarks to evaluate geometric aesthetics and human preferences. The code and datasets will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11435v1-abstract-full').style.display = 'none'; document.getElementById('2411.11435v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 

arXiv:2411.11396 (https://arxiv.org/abs/2411.11396) [cs.CV]
Title: Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face Forgery Detection
Authors: Jikang Cheng, Zhiyuan Yan, Ying Zhang, Li Hao, Jiaxin Ai, Qin Zou, Chen Li, Zhongyuan Wang
Abstract: The rapid advancement of face forgery techniques has introduced a growing variety of forgeries. Incremental Face Forgery Detection (IFFD), involving gradually adding new forgery data to fine-tune the previously trained model, has been introduced as a promising strategy to deal with evolving forgery methods. However, a naively trained IFFD model is prone to catastrophic forgetting when new forgeries are integrated, as treating all forgeries as a single "Fake" class in the Real/Fake classification can cause different forgery types to override one another, thereby resulting in the forgetting of unique characteristics from earlier tasks and limiting the model's effectiveness in learning forgery specificity and generality. In this paper, we propose to stack the latent feature distributions of previous and new tasks brick by brick, i.e., achieving aligned feature isolation. In this manner, we aim to preserve learned forgery information and accumulate new knowledge by minimizing distribution overriding, thereby mitigating catastrophic forgetting. To achieve this, we first introduce Sparse Uniform Replay (SUR) to obtain representative subsets that can be treated as the uniformly sparse versions of the previous global distributions.
We then propose a Latent-space Incremental Detector (LID) that leverages SUR data to isolate and align distributions. For evaluation, we construct a more advanced and comprehensive benchmark tailored for IFFD. The leading experimental results validate the superiority of our method.
Submitted 19 November, 2024; v1 submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.11116 (https://arxiv.org/abs/2411.11116) [eess.IV, cs.CV]
Title: DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image Segmentation
Authors: Guoping Xu, Ximing Wu, Wentao Liao, Xinglong Wu, Qing Huang, Chang Li
Abstract: Accurately segmenting lesions in ultrasound images is challenging due to the difficulty in distinguishing boundaries between lesions and surrounding tissues. While deep learning has improved segmentation accuracy, there is limited focus on boundary quality and its relationship with body structures.
To address this, we introduce UBBS-Net, a dual-branch deep neural network that learns the relationship between body and boundary for improved segmentation. We also propose a feature fusion module to integrate body and boundary information. Evaluated on three public datasets, UBBS-Net outperforms existing methods, achieving Dice Similarity Coefficients of 81.05% for breast cancer, 76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma segmentation. Our results demonstrate the effectiveness of UBBS-Net for ultrasound image segmentation. The code is available at https://github.com/apple1986/DBF-Net.
Submitted 17 November, 2024; originally announced November 2024.
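The Dice Similarity Coefficient quoted above is the standard overlap metric 2|A∩B| / (|A| + |B|); below is a minimal sketch of computing it for binary segmentation masks (the array contents are toy values).

```python
import numpy as np

def dice_coefficient(pred, target, eps=1e-7):
    """Dice Similarity Coefficient between two binary masks: 2|A∩B| / (|A| + |B|)."""
    pred = pred.astype(bool)
    target = target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

# Toy example: masks overlapping on 3 pixels, with 4 and 3 foreground pixels respectively.
pred = np.zeros((4, 4), dtype=np.uint8)
target = np.zeros((4, 4), dtype=np.uint8)
pred[0, :4] = 1
target[0, :3] = 1
print(round(dice_coefficient(pred, target), 3))  # 2*3 / (4 + 3) ≈ 0.857
```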

arXiv:2411.10770 (https://arxiv.org/abs/2411.10770) [cs.CR]
Title: Task Offloading for Vehicular Edge Computing Based on Improved Hotstuff under Parking Assistance
Authors: Guoling Liang, Chunhai Li, Feng Zhao, Chuan Zhang, Liehuang Zhu
Abstract: Parked-assisted vehicular edge computing (PVEC) fully leverages communication and computing resources of parking vehicles, thereby significantly alleviating the pressure on edge servers. However, resource sharing and trading for vehicular task offloading in the PVEC environment usually occur between untrustworthy entities, which compromises the security of data sharing and transactions by vehicles and edge devices. To address these concerns, blockchain is introduced to provide a secure and trustworthy environment for offloading and transactions in PVEC. Nevertheless, due to the mobility of the vehicles, the processes of computing offloading and blockchain transactions are interrupted, which greatly reduces the reliability of the blockchain in the edge computing process. In this paper, we propose a blockchain-based PVEC (BPVEC) offloading framework to enhance the security and reliability of task offloading and transactions. Specifically, a consensus node selection algorithm based on the connected dominating set (CDS) is designed to improve the Hotstuff consensus according to parking time, computing capability and communication quality, which enhances blockchain reliability in computing offloading and transactions. Meanwhile, a Stackelberg game model, establishing the roadside units (RSUs) and parking vehicles (PVs) as leaders and the requesting vehicles (RVs) as followers, is utilized to optimize the offloading strategy and pricing. Subsequently, a BPVEC offloading strategy algorithm with the gradient descent method is designed to maximize system revenue. Simulation results show that the proposed BPVEC offloading scheme is secure and reliable while ensuring maximum benefits.
Submitted 16 November, 2024; originally announced November 2024.

arXiv:2411.10680 (https://arxiv.org/abs/2411.10680) [cs.CR]
Title: Two-layer consensus based on master-slave consortium chain data sharing for Internet of Vehicles
Authors: Feng Zhao, Benchang Yang, Chunhai Li, Chuan Zhang, Liehuang Zhu, Guoling Liang
Abstract: Due to insufficient scalability, the existing consortium chain cannot meet the requirements of low latency, high throughput, and high security when applied to Internet of Vehicles (IoV) data sharing. Therefore, we propose a two-layer consensus algorithm based on the master-slave consortium chain: Weighted Raft and Byzantine Fault Tolerance (WRBFT). The intra-group consensus of the WRBFT algorithm adopts weighted Raft, and the best node is selected as the master node to lead the intra-group consensus by comprehensively evaluating the signal-to-noise ratio (SNR), data processing capacity, and storage capacity of the nodes. The inter-group consensus adopts practical Byzantine fault tolerance (PBFT) based on BLS aggregate signatures with nonlinear coefficients to ensure that the inter-group consensus can tolerate 1/3 of Byzantine nodes. At the same time, a verifiable random function (VRF) is used to select the master node of the inter-group consensus to ensure the randomness of the master node. Extensive experimental results show that the proposed WRBFT algorithm reduces delay and improves throughput and system security.
Submitted 15 November, 2024; originally announced November 2024.
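To illustrate the kind of weighted node evaluation described for master-node selection, here is a minimal scoring sketch over SNR, processing capacity, and storage; the metrics, normalization, and weights are assumptions, not the paper's formula. The 1/3 Byzantine tolerance mentioned corresponds to the classical PBFT requirement of n >= 3f + 1 replicas.

```python
from dataclasses import dataclass

@dataclass
class Node:
    name: str
    snr_db: float        # signal-to-noise ratio of the node's link
    cpu_gflops: float    # data processing capacity
    storage_gb: float    # storage capacity

def normalized(values):
    """Min-max normalize a list of metric values to [0, 1]."""
    lo, hi = min(values), max(values)
    return [0.5 if hi == lo else (v - lo) / (hi - lo) for v in values]

def select_master(nodes, w_snr=0.4, w_cpu=0.4, w_store=0.2):
    """Pick the highest-scoring node under an assumed linear weighting of the three metrics."""
    snr = normalized([n.snr_db for n in nodes])
    cpu = normalized([n.cpu_gflops for n in nodes])
    sto = normalized([n.storage_gb for n in nodes])
    scores = [w_snr * s + w_cpu * c + w_store * g for s, c, g in zip(snr, cpu, sto)]
    best = max(range(len(nodes)), key=scores.__getitem__)
    return nodes[best], scores[best]

candidates = [Node("node-1", 22.0, 40.0, 256), Node("node-2", 30.0, 25.0, 512), Node("node-3", 27.0, 60.0, 128)]
leader, score = select_master(candidates)
print(leader.name, round(score, 3))
```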

arXiv:2411.10666 (https://arxiv.org/abs/2411.10666) [cs.CL, cs.AI]
Title: SAM Decoding: Speculative Decoding via Suffix Automaton
Authors: Yuxuan Hu, Ke Wang, Jing Zhang, Cuiping Li, Hong Chen
Abstract: Large Language Models (LLMs) have revolutionized natural language processing by unifying tasks into text generation, yet their large parameter sizes and autoregressive nature limit inference speed. SAM-Decoding addresses this by introducing a novel retrieval-based speculative decoding method that uses a suffix automaton for efficient and accurate draft generation. Unlike the n-gram matching used by existing methods, SAM-Decoding finds the longest suffix match in the generated text and the text corpus, achieving an average time complexity of O(1) per generation step. SAM-Decoding constructs static and dynamic suffix automatons for the text corpus and input prompts, respectively, enabling fast and precise draft generation. Meanwhile, it is designed as an approach that can be combined with existing methods, allowing SAM-Decoding to adaptively select a draft generation strategy based on the matching length, thus increasing the inference speed of the LLM. When combined with Token Recycling, evaluations show SAM-Decoding outperforms existing model-free methods, achieving a speedup of 2.27x over autoregressive decoding on Spec-Bench. When combined with EAGLE2, it reaches a speedup of 2.49x, surpassing all current approaches. Our code is available at https://github.com/hyx1999/SAM-Decoding.
Submitted 15 November, 2024; originally announced November 2024.
Comments: 13 pages, 3 figures
ACM Class: I.2.7
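To make the retrieval step concrete, below is a minimal, generic suffix-automaton sketch (textbook construction, not the paper's implementation): it is built over a token corpus and reports the length of the longest suffix of the generated text that occurs in the corpus. In a speculative decoder this match state would be advanced incrementally as each new token arrives, and the corpus tokens following the matched position would be proposed as the draft.

```python
class SuffixAutomaton:
    """Suffix automaton over a token sequence; supports longest-suffix matching."""

    def __init__(self, tokens):
        self.next = [dict()]   # outgoing transitions per state
        self.link = [-1]       # suffix links
        self.length = [0]      # length of the longest substring in each state
        self.last = 0
        for t in tokens:
            self._extend(t)

    def _extend(self, t):
        cur = len(self.next)
        self.next.append(dict())
        self.length.append(self.length[self.last] + 1)
        self.link.append(-1)
        p = self.last
        while p != -1 and t not in self.next[p]:
            self.next[p][t] = cur
            p = self.link[p]
        if p == -1:
            self.link[cur] = 0
        else:
            q = self.next[p][t]
            if self.length[p] + 1 == self.length[q]:
                self.link[cur] = q
            else:
                clone = len(self.next)
                self.next.append(dict(self.next[q]))
                self.length.append(self.length[p] + 1)
                self.link.append(self.link[q])
                while p != -1 and self.next[p].get(t) == q:
                    self.next[p][t] = clone
                    p = self.link[p]
                self.link[q] = clone
                self.link[cur] = clone
        self.last = cur

    def longest_suffix_match(self, generated):
        """Length of the longest suffix of `generated` that occurs in the corpus.

        The (state, match) pair can be updated token by token, which is what gives
        amortized O(1) work per newly generated token.
        """
        state, match = 0, 0
        for t in generated:
            while state and t not in self.next[state]:
                state = self.link[state]
                match = self.length[state]
            if t in self.next[state]:
                state = self.next[state][t]
                match += 1
            else:
                state, match = 0, 0
        return match

corpus = "the quick brown fox jumps over the lazy dog".split()
sam = SuffixAutomaton(corpus)
print(sam.longest_suffix_match("watch the lazy".split()))  # 2 -> suffix "the lazy" occurs in the corpus
```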

arXiv:2411.10498 (https://arxiv.org/abs/2411.10498) [cs.CV]
Title: Prompt-Guided Environmentally Consistent Adversarial Patch
Authors: Chaoqun Li, Huanqian Yan, Lifeng Zhou, Tairan Chen, Zhuodong Liu, Hang Su
Abstract: Adversarial attacks in the physical world pose a significant threat to the security of vision-based systems, such as facial recognition and autonomous driving. Existing adversarial patch methods primarily focus on improving attack performance, but they often produce patches that are easily detectable by humans and struggle to achieve environmental consistency, i.e., blending patches into the environment. This paper introduces a novel approach for generating adversarial patches, which addresses both the visual naturalness and environmental consistency of the patches. We propose the Prompt-Guided Environmentally Consistent Adversarial Patch (PG-ECAP), a method that aligns the patch with the environment to ensure seamless integration. The approach leverages diffusion models to generate patches that are both environmentally consistent and effective in evading detection. To further enhance the naturalness and consistency, we introduce two alignment losses, a Prompt Alignment Loss and a Latent Space Alignment Loss, ensuring that the generated patch maintains its adversarial properties while fitting naturally within its environment.
Extensive experiments in both digital and physical domains demonstrate that PG-ECAP outperforms existing methods in attack success rate and environmental consistency.
Submitted 15 November, 2024; originally announced November 2024.

arXiv:2411.10272 (https://arxiv.org/abs/2411.10272) [cs.AI, cs.CL, cs.LG]
Title: Scaling Law for Post-training after Model Pruning
Authors: Xiaodong Chen, Yuxuan Hu, Jing Zhang, Xiaokang Zhang, Cuiping Li, Hong Chen
Abstract: Large language models (LLMs) based on the Transformer architecture are widely employed across various domains and tasks. However, their increasing size imposes significant hardware demands, limiting practical deployment. To mitigate this, model pruning techniques have been developed to create more efficient models while maintaining high performance. Despite this, post-training after pruning is crucial for performance recovery and can be resource-intensive. This paper investigates the post-training requirements of pruned LLMs and introduces a scaling law to determine the optimal amount of post-training data.
Post-training experiments with the Llama-3 and Qwen-2.5 series models, pruned using depth pruning, width pruning, and 2:4 semi-structured pruning, show that higher pruning ratios necessitate more post-training data for performance recovery, whereas larger LLMs require less. The proposed scaling law predicts a model's loss based on its parameter counts before and after pruning, as well as the post-training token counts. Furthermore, we find that the scaling law established from smaller LLMs can be reliably extrapolated to larger LLMs. This work provides valuable insights into the post-training of pruned LLMs and offers a practical scaling law for optimizing post-training data usage.
Submitted 15 November, 2024; originally announced November 2024.
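Purely as an illustration of what such a predictor could look like (the paper's actual functional form and coefficients are not reproduced here), the sketch below assumes a power-law-style loss in the pre-pruning size N0, post-pruning size N1, and post-training token count D; every coefficient is a made-up placeholder.

```python
def assumed_loss(n0, n1, d, E=1.8, A=1.2, B=0.3, alpha=0.5, beta=0.4):
    """Hypothetical scaling form: L = E + A / N1^alpha + B * (N0/N1) / D^beta.

    n0: parameters before pruning (billions), n1: after pruning (billions),
    d: post-training tokens (billions). Scaling the data term by the pruning
    ratio N0/N1 mirrors the observation that heavier pruning needs more
    post-training data; all values here are illustrative assumptions.
    """
    return E + A / n1 ** alpha + B * (n0 / n1) / d ** beta

# More aggressive pruning (smaller N1) needs more data D to reach a given loss.
for n1 in (6.0, 4.0):
    for d in (1.0, 10.0):
        print(f"N0=8B, N1={n1}B, D={d}B tokens -> predicted loss {assumed_loss(8.0, n1, d):.3f}")
```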
arXiv:2411.10232  [pdf, other]  cs.CV cs.AI
ColorEdit: Training-free Image-Guided Color editing with diffusion model
Authors: Xingxi Yin, Zhi Li, Jingfeng Zhang, Chenglin Li, Yin Zhang
Abstract: Text-to-image (T2I) diffusion models, with their impressive generative capabilities, have been adopted for image editing tasks, demonstrating remarkable efficacy. However, due to attention leakage and collision between the cross-attention map of the object and the new color attribute from the text prompt, text-guided image editing methods may fail to change the color of an object, resulting in a misalignment between the resulting image and the text prompt. In this paper, we conduct an in-depth analysis of the text-guided image synthesis process and of the semantic information that different cross-attention blocks learn. We observe that the visual representation of an object is determined in the up-block of the diffusion model in the early stage of the denoising process, and that color adjustment can be achieved through alignment of the value matrices in the cross-attention layer. Based on these findings, we propose a straightforward, stable, and effective image-guided method to modify the color of an object without requiring any additional fine-tuning or training. Lastly, we present COLORBENCH, the first benchmark dataset for evaluating the performance of color change methods. Extensive experiments validate the effectiveness of our method in object-level color editing and show that it surpasses popular text-guided image editing approaches on both synthesized and real images.
Submitted 15 November, 2024; originally announced November 2024.
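As context for the "value matrix alignment" observation, the sketch below shows a plain cross-attention computation (queries from image latents, keys and values from text embeddings) together with a purely schematic value-swapping step; the function names, the token-index argument, and the alignment rule itself are hypothetical illustrations, not the authors' procedure.

import torch
import torch.nn.functional as F

def cross_attention(q, k, v):
    # q: (B, N_img, d) latent queries; k, v: (B, N_txt, d) text keys/values
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    return F.softmax(scores, dim=-1) @ v

def recolor_via_value_alignment(q, k, v_edit, v_ref, color_token_idx):
    # Hypothetical illustration: keep the attention map of the edited prompt but
    # borrow the value row of the colour token from a reference pass, so only the
    # colour attribute (carried by V) changes while spatial structure is preserved.
    v_aligned = v_edit.clone()
    v_aligned[:, color_token_idx, :] = v_ref[:, color_token_idx, :]
    return cross_attention(q, k, v_aligned)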
arXiv:2411.10189  [pdf, other]  cs.CV
NeISF++: Neural Incident Stokes Field for Polarized Inverse Rendering of Conductors and Dielectrics
Authors: Chenhao Li, Taishi Ono, Takeshi Uemori, Sho Nitta, Hajime Mihara, Alexander Gatto, Hajime Nagahara, Yusuke Moriuchi
Abstract: Recent inverse rendering methods have greatly improved shape, material, and illumination reconstruction by utilizing polarization cues. However, existing methods only support dielectrics, ignoring conductors, which are found everywhere in daily life. Since conductors and dielectrics have different reflection properties, applying previous dielectric-oriented methods to conductors leads to obvious errors. In addition, conductors are glossy, which can cause strong specular reflections that are hard to reconstruct. To solve the above issues, we propose NeISF++, an inverse rendering pipeline that supports both conductors and dielectrics. The key ingredient of our proposal is a general pBRDF that describes both conductors and dielectrics. As for the strong specular reflection problem, we propose a novel geometry initialization method using DoLP images. This physical cue is invariant to intensities and thus robust to strong specular reflections. Experimental results on our synthetic and real datasets show that our method surpasses existing polarized inverse rendering methods for geometry and material decomposition as well as downstream tasks like relighting.
Submitted 15 November, 2024; originally announced November 2024.
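For reference, the DoLP (degree of linear polarization) cue used for geometry initialization has a standard definition in terms of the first three Stokes components; the minimal sketch below states that standard formula and is not code from the paper.

import numpy as np

def degree_of_linear_polarization(s0, s1, s2, eps=1e-8):
    # DoLP = sqrt(S1^2 + S2^2) / S0: normalised by the intensity S0, which is why
    # the cue is insensitive to how bright a specular highlight is.
    return np.sqrt(s1 ** 2 + s2 ** 2) / (s0 + eps)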
arXiv:2411.09702  [pdf, other]  cs.LG cs.AI cs.CV cs.NE
On the Surprising Effectiveness of Attention Transfer for Vision Transformers
Authors: Alexander C. Li, Yuandong Tian, Beidi Chen, Deepak Pathak, Xinlei Chen
Abstract: Conventional wisdom suggests that pre-training Vision Transformers (ViT) improves downstream performance by learning useful representations. Is this actually true? We investigate this question and find that the features and representations learned during pre-training are not essential. Surprisingly, using only the attention patterns from pre-training (i.e., guiding how information flows between tokens) is sufficient for models to learn high-quality features from scratch and achieve comparable downstream performance. We show this by introducing a simple method called attention transfer, where only the attention patterns from a pre-trained teacher ViT are transferred to a student, either by copying or distilling the attention maps. Since attention transfer lets the student learn its own features, ensembling it with a fine-tuned teacher also further improves accuracy on ImageNet. We systematically study various aspects of our findings on the sufficiency of attention maps, including distribution shift settings where they underperform fine-tuning. We hope our exploration provides a better understanding of what pre-training accomplishes and leads to a useful alternative to the standard practice of fine-tuning.
Submitted 14 November, 2024; originally announced November 2024.
Comments: NeurIPS 2024. Code: https://github.com/alexlioralexli/attention-transfer
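A minimal sketch of the "copying" variant described above, where a block routes the student's own values with the teacher's frozen attention map, is given below; the shapes, projection matrices, and surrounding training loop are assumptions of this sketch, and the authors' actual implementation is in the repository linked in the comments.

import torch
import torch.nn.functional as F

def attention_maps(x, wq, wk, num_heads):
    # x: (B, N, D) token features; returns per-head attention maps of shape (B, H, N, N)
    B, N, D = x.shape
    q = (x @ wq).reshape(B, N, num_heads, -1).transpose(1, 2)
    k = (x @ wk).reshape(B, N, num_heads, -1).transpose(1, 2)
    return F.softmax(q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5), dim=-1)

def attention_copy_block(student_x, teacher_attn, wv, wo, num_heads):
    # The student keeps its own value/output projections but mixes tokens with the
    # teacher's (detached) attention pattern -- the "attention copy" idea.
    B, N, D = student_x.shape
    v = (student_x @ wv).reshape(B, N, num_heads, -1).transpose(1, 2)
    mixed = (teacher_attn.detach() @ v).transpose(1, 2).reshape(B, N, D)
    return mixed @ wo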
arXiv:2411.09105  [pdf, other]  cs.CV cs.AI
VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges in Video Cognition
Authors: Chenglin Li, Qianglong Chen, Zhi Li, Feng Tao, Yin Zhang
Abstract: Recent advancements in Large Video-Language Models (LVLMs) have driven the development of benchmarks designed to assess cognitive abilities in video-based tasks. However, most existing benchmarks heavily rely on web-collected videos paired with human annotations or model-generated questions, which limit control over the video content and fall short in evaluating advanced cognitive abilities involving symbolic elements and abstract concepts. To address these limitations, we introduce VCBench, a controllable benchmark to assess LVLMs' cognitive abilities involving symbolic and abstract concepts at varying difficulty levels. By generating video data with a Python-based engine, VCBench allows precise control over the video content, creating dynamic, task-oriented videos that feature complex scenes and abstract concepts. Each task is paired with tailored question templates that target specific cognitive challenges, providing a rigorous evaluation test. Our evaluation reveals that even state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple video cognition tasks involving abstract concepts, with performance dropping sharply by 19% as video complexity rises. These findings reveal the current limitations of LVLMs in advanced cognitive tasks and highlight the critical role of VCBench in driving research toward more robust LVLMs for complex video cognition challenges.
Submitted 13 November, 2024; originally announced November 2024.
arXiv:2411.08370  [pdf]  cs.AI
A Fuzzy Reinforcement LSTM-based Long-term Prediction Model for Fault Conditions in Nuclear Power Plants
Authors: Siwei Li, Jiayan Fang, Yichun Wua, Wei Wang, Chengxin Li, Jiangwen Chen
Abstract: Early fault detection and timely maintenance scheduling can significantly mitigate operational risks in nuclear power plants (NPPs) and enhance the reliability of operator decision-making. Therefore, it is necessary to develop an efficient Prognostics and Health Management (PHM) multi-step prediction model for predicting system health status and prompting the timely execution of maintenance operations. In this study, we propose a novel predictive model that integrates reinforcement learning with Long Short-Term Memory (LSTM) neural networks and the Expert Fuzzy Evaluation Method. The model is validated using parameter data for 20 different breach sizes in the Main Steam Line Break (MSLB) accident condition of the CPR1000 pressurized water reactor simulation model. It demonstrates a remarkable capability to accurately forecast NPP parameter changes up to 128 steps ahead (with a time interval of 10 seconds per step, i.e., 1280 seconds), thereby satisfying the temporal advance requirement for fault prognostics in NPPs. Furthermore, this method provides an effective reference solution for PHM applications such as anomaly detection and remaining useful life prediction.
Submitted 13 November, 2024; originally announced November 2024.
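The 128-step horizon quoted above implies an autoregressive multi-step rollout; the sketch below shows only that rollout pattern with a plain LSTM, leaving out the reinforcement-learning and expert fuzzy evaluation components of the paper, and all layer sizes are assumptions.

import torch
import torch.nn as nn

class MultiStepForecaster(nn.Module):
    def __init__(self, n_params, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(n_params, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_params)

    @torch.no_grad()
    def rollout(self, history, steps=128):
        # history: (B, T, n_params) past plant parameters; returns (B, steps, n_params)
        out, state = self.lstm(history)
        x = self.head(out[:, -1:])            # first predicted 10-second frame
        preds = [x]
        for _ in range(steps - 1):
            out, state = self.lstm(x, state)  # feed each prediction back in
            x = self.head(out)
            preds.append(x)
        return torch.cat(preds, dim=1)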
arXiv:2411.08305  [pdf, other]  eess.IV cs.CV
Robust Divergence Learning for Missing-Modality Segmentation
Authors: Runze Cheng, Zhongao Sun, Ye Zhang, Chun Li
Abstract: Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary information for analyzing brain tumor subregions. While methods using four common MRI modalities for automatic segmentation have shown success, they often face challenges with missing modalities due to image quality issues, inconsistent protocols, allergic reactions, or cost factors. Thus, developing a segmentation paradigm that handles missing modalities is clinically valuable. A novel single-modality parallel processing network framework based on Hölder divergence and mutual information is introduced. Each modality is independently input into a shared network backbone for parallel processing, preserving unique information. Additionally, a dynamic sharing framework is introduced that adjusts network parameters based on modality availability. Hölder divergence and mutual information-based loss functions are used to evaluate discrepancies between predictions and labels. Extensive testing on the BraTS 2018 and BraTS 2020 datasets demonstrates that our method outperforms existing techniques in handling missing modalities and validates the effectiveness of each component.
Submitted 12 November, 2024; originally announced November 2024.
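For readers unfamiliar with Hölder divergences, one standard Hölder-inequality-based formulation between discrete positive vectors p and q is the log of the Hölder inequality gap, which is non-negative and vanishes only when p^alpha and q^beta are proportional. The sketch below shows that formulation; it may differ in detail from the loss actually used in the paper.

import numpy as np

def holder_pseudo_divergence(p, q, alpha=2.0):
    # D(p, q) = log( ||p||_alpha * ||q||_beta / <p, q> ) with 1/alpha + 1/beta = 1.
    # Non-negative by Hoelder's inequality; illustrative formulation only.
    beta = alpha / (alpha - 1.0)
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    norm_p = np.sum(p ** alpha) ** (1.0 / alpha)
    norm_q = np.sum(q ** beta) ** (1.0 / beta)
    return float(np.log(norm_p * norm_q / np.sum(p * q)))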
href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07111v1-abstract-short" style="display: inline;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07111v1-abstract-full" style="display: none;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex capabilities allowing simultaneous speaking and listening. The paper also details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction. We also developed a platform to evaluate conversational fluency and response coherence in multi-turn dialogues. We hope the release of the report can contribute to the future development of spoken LLMs in Taiwanese Mandarin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'none'; document.getElementById('2411.07111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06893">arXiv:2411.06893</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.06893">pdf</a>, <a href="https://arxiv.org/format/2411.06893">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-scale Frequency Enhancement Network for Blind Image Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+Y">Yawen Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Heng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chengyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhongbo Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yongqiang Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06893v1-abstract-short" style="display: inline;"> Image deblurring is an essential image preprocessing technique, aiming to recover clear and detailed images form blurry ones. However, existing algorithms often fail to effectively integrate multi-scale feature extraction with frequency enhancement, limiting their ability to reconstruct fine textures. Additionally, non-uniform blur in images also restricts the effectiveness of image restoration. T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06893v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06893v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06893v1-abstract-full" style="display: none;"> Image deblurring is an essential image preprocessing technique, aiming to recover clear and detailed images form blurry ones. However, existing algorithms often fail to effectively integrate multi-scale feature extraction with frequency enhancement, limiting their ability to reconstruct fine textures. Additionally, non-uniform blur in images also restricts the effectiveness of image restoration. To address these issues, we propose a multi-scale frequency enhancement network (MFENet) for blind image deblurring. To capture the multi-scale spatial and channel information of blurred images, we introduce a multi-scale feature extraction module (MS-FE) based on depthwise separable convolutions, which provides rich target features for deblurring. We propose a frequency enhanced blur perception module (FEBP) that employs wavelet transforms to extract high-frequency details and utilizes multi-strip pooling to perceive non-uniform blur, combining multi-scale information with frequency enhancement to improve the restoration of image texture details. Experimental results on the GoPro and HIDE datasets demonstrate that the proposed method achieves superior deblurring performance in both visual quality and objective evaluation metrics. 
arXiv:2411.06893  [pdf, other]  cs.CV
Multi-scale Frequency Enhancement Network for Blind Image Deblurring
Authors: Yawen Xiang, Heng Zhou, Chengyang Li, Zhongbo Li, Yongqiang Xie
Abstract: Image deblurring is an essential image preprocessing technique, aiming to recover clear and detailed images from blurry ones. However, existing algorithms often fail to effectively integrate multi-scale feature extraction with frequency enhancement, limiting their ability to reconstruct fine textures. Additionally, non-uniform blur in images also restricts the effectiveness of image restoration. To address these issues, we propose a multi-scale frequency enhancement network (MFENet) for blind image deblurring. To capture the multi-scale spatial and channel information of blurred images, we introduce a multi-scale feature extraction module (MS-FE) based on depthwise separable convolutions, which provides rich target features for deblurring. We propose a frequency-enhanced blur perception module (FEBP) that employs wavelet transforms to extract high-frequency details and utilizes multi-strip pooling to perceive non-uniform blur, combining multi-scale information with frequency enhancement to improve the restoration of image texture details. Experimental results on the GoPro and HIDE datasets demonstrate that the proposed method achieves superior deblurring performance in both visual quality and objective evaluation metrics. Furthermore, in downstream object detection tasks, the proposed blind image deblurring algorithm significantly improves detection accuracy, further validating its effectiveness and robustness in the field of image deblurring.
Submitted 11 November, 2024; originally announced November 2024.
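The use of wavelet transforms to isolate high-frequency detail, as in the FEBP module described above, can be illustrated with a single-level 2-D discrete wavelet transform; the sketch below uses PyWavelets and is a schematic of the general idea, not the module itself.

import numpy as np
import pywt

def high_frequency_subbands(image, wavelet="haar"):
    # Single-level 2-D DWT: cA is the low-frequency approximation, while cH, cV, cD
    # carry horizontal / vertical / diagonal high-frequency detail (edges, fine texture).
    cA, (cH, cV, cD) = pywt.dwt2(image, wavelet)
    return cH, cV, cD

print([band.shape for band in high_frequency_subbands(np.random.rand(64, 64))])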
arXiv:2411.06786  [pdf, other]  cs.CV cs.AI cs.LG
ScaleKD: Strong Vision Transformers Could Be Excellent Teachers
Authors: Jiawei Fan, Chao Li, Xiaolong Liu, Anbang Yao
Abstract: In this paper, we question whether well pre-trained vision transformer (ViT) models could be used as teachers that exhibit scalable properties to advance cross-architecture knowledge distillation (KD) research, in the context of using large-scale datasets for evaluation. To make this possible, our analysis underlines the importance of seeking effective strategies to align (1) feature computing paradigm differences, (2) model scale differences, and (3) knowledge density differences. By combining three coupled components, namely cross attention projector, dual-view feature mimicking, and teacher parameter perception, tailored to address the above problems, we present a simple and effective KD method called ScaleKD. Our method can train student backbones spanning a variety of convolutional neural network (CNN), multi-layer perceptron (MLP), and ViT architectures on image classification datasets, achieving state-of-the-art distillation performance. For instance, taking a well pre-trained Swin-L as the teacher model, our method obtains the following top-1 accuracies for students trained from scratch on ImageNet-1K (absolute gain over the individually trained counterpart in parentheses): MobileNet-V1 75.15% (+3.05%), ResNet-50 82.03% (+3.39%), ConvNeXt-T 84.16% (+2.02%), Mixer-S/16 78.63% (+4.61%), Mixer-B/16 81.96% (+5.52%), ViT-S/16 83.93% (+4.03%), Swin-T 83.80% (+2.62%), and ViT-B/16 85.53% (+3.73%). Intriguingly, when scaling up the size of teacher models or their pre-training datasets, our method showcases the desired scalable properties, bringing increasingly larger gains to student models. The student backbones trained by our method transfer well to the downstream MS-COCO and ADE20K datasets. More importantly, our method could be used as a more efficient alternative to the time-intensive pre-training paradigm for any target student model if a strong pre-trained ViT is available, reducing the number of viewed training samples by up to 195x.
Submitted 11 November, 2024; originally announced November 2024.
Comments: This work is accepted to NeurIPS 2024. The project page: https://github.com/deep-optimization/ScaleKD
arXiv:2411.06493  [pdf, other]  cs.CR cs.AI
LProtector: An LLM-driven Vulnerability Detection System
Authors: Ze Sheng, Fenghua Wu, Xiangwu Zuo, Chao Li, Yuxin Qiao, Lei Hang
Abstract: This paper presents LProtector, an automated vulnerability detection system for C/C++ codebases driven by the large language model (LLM) GPT-4o and Retrieval-Augmented Generation (RAG). As software complexity grows, traditional methods face challenges in detecting vulnerabilities effectively. LProtector leverages GPT-4o's powerful code comprehension and generation capabilities to perform binary classification and identify vulnerabilities within target codebases. We conducted experiments on the Big-Vul dataset, showing that LProtector outperforms two state-of-the-art baselines in terms of F1 score, demonstrating the potential of integrating LLMs with vulnerability detection.
Submitted 14 November, 2024; v1 submitted 10 November, 2024; originally announced November 2024.
Comments: 5 pages, 4 figures. This is a preprint version of the article. The final version will be published in the proceedings of the IEEE conference.
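A schematic of the retrieval-augmented binary-classification loop the abstract describes might look like the sketch below; retrieve_similar_examples and call_llm are hypothetical placeholders standing in for a vector index over known-vulnerable code and a GPT-4o (or other LLM) API call, and none of this is LProtector's actual code.

def retrieve_similar_examples(code_snippet, k=5):
    # Hypothetical placeholder: fetch k similar known-vulnerable functions
    # (e.g. from an embedding index built over Big-Vul) to ground the prompt.
    raise NotImplementedError

def call_llm(prompt):
    # Hypothetical placeholder for a GPT-4o (or other LLM) completion call.
    raise NotImplementedError

def classify_snippet(code_snippet):
    examples = retrieve_similar_examples(code_snippet)
    prompt = (
        "You are a C/C++ security auditor. Given the reference examples and the "
        "target function, answer VULNERABLE or SAFE.\n\n"
        f"Reference examples:\n{examples}\n\nTarget function:\n{code_snippet}\n"
    )
    return call_llm(prompt).strip().upper().startswith("VULN")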
arXiv:2411.06306  [pdf, other]  cs.RO cs.AI cs.HC
Optimal Driver Warning Generation in Dynamic Driving Environment
Authors: Chenran Li, Aolin Xu, Enna Sachdeva, Teruhisa Misu, Behzad Dariush
Abstract: The driver warning system that alerts the human driver about potential risks during driving is a key feature of an advanced driver assistance system. Existing driver warning technologies, mainly forward collision warning and unsafe lane change warning, can reduce the risk of collision caused by human errors. However, the current design methods have several major limitations. Firstly, the warnings are mainly generated in a one-shot manner without modeling the ego driver's reactions and surrounding objects, which reduces the flexibility and generality of the system over different scenarios. Additionally, the triggering conditions of warnings are mostly rule-based threshold-checking given the current state, which lacks prediction of the potential risk over a sufficiently long future horizon. In this work, we study the problem of optimally generating driver warnings by considering the interactions among the generated warning, the driver behavior, and the states of the ego and surrounding vehicles over a long horizon. The warning generation problem is formulated as a partially observed Markov decision process (POMDP), and an optimal warning generation framework is proposed as its solution. The simulation experiments demonstrate the superiority of the proposed solution over existing warning generation methods.
Submitted 9 November, 2024; originally announced November 2024.
Comments: ICRA 2024
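The POMDP formulation mentioned above can be summarised by its standard components; the sketch below only fixes that tuple structure, and the example state, action, and observation descriptions in the comments are illustrative assumptions rather than the paper's actual model.

from dataclasses import dataclass
from typing import Callable, Sequence

@dataclass
class DriverWarningPOMDP:
    states: Sequence              # hidden joint state: ego vehicle, driver, surrounding vehicles
    actions: Sequence             # e.g. {no_warning, issue_warning}
    observations: Sequence        # noisy measurements of driver attention and traffic
    transition: Callable          # T(s, a) -> distribution over next states
    observation_model: Callable   # O(s', a) -> distribution over observations
    reward: Callable              # R(s, a): trades collision risk against over-warning
    discount: float = 0.95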
