Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 139 results for author: <span class="mathjax">Feng, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Feng%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Feng, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Feng%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Feng, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Feng%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Feng%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13397">arXiv:2501.13397</a> <span> [<a href="https://arxiv.org/pdf/2501.13397">pdf</a>, <a href="https://arxiv.org/format/2501.13397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExLM: Rethinking the Impact of [MASK] Tokens in Masked Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kangjie Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junwei Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Siyue Liang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zequn Liu</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13397v4-abstract-short" style="display: inline;"> Masked Language Models (MLMs) have achieved remarkable success in many self-supervised representation learning tasks. MLMs are trained by randomly masking portions of the input sequences with [MASK] tokens and learning to reconstruct the original content based on the remaining context. This paper explores the impact of [MASK] tokens on MLMs. 
Analytical studies show that masking tokens can introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13397v4-abstract-full').style.display = 'inline'; document.getElementById('2501.13397v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13397v4-abstract-full" style="display: none;"> Masked Language Models (MLMs) have achieved remarkable success in many self-supervised representation learning tasks. MLMs are trained by randomly masking portions of the input sequences with [MASK] tokens and learning to reconstruct the original content based on the remaining context. This paper explores the impact of [MASK] tokens on MLMs. Analytical studies show that masking tokens can introduce the corrupted semantics problem, wherein the corrupted context may convey multiple, ambiguous meanings. This problem is also a key factor affecting the performance of MLMs on downstream tasks. Based on these findings, we propose a novel enhanced-context MLM, ExLM. Our approach expands [MASK] tokens in the input context and models the dependencies between these expanded states. This enhancement increases context capacity and enables the model to capture richer semantic information, effectively mitigating the corrupted semantics problem during pre-training. Experimental results demonstrate that ExLM achieves significant performance improvements in both text modeling and SMILES modeling tasks. Further analysis confirms that ExLM enriches semantic representations through context enhancement, and effectively reduces the semantic multimodality commonly observed in MLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13397v4-abstract-full').style.display = 'none'; document.getElementById('2501.13397v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
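An aside on the mechanism this abstract describes: ExLM expands each [MASK] token into several consecutive mask states whose representations are modeled jointly. The sketch below only illustrates that expansion step; the expansion factor k, the token handling, and the grouping bookkeeping are assumptions rather than the authors' implementation.

```python
from typing import List, Tuple

MASK = "[MASK]"

def expand_masks(tokens: List[str], k: int = 3) -> Tuple[List[str], List[List[int]]]:
    """Expand every [MASK] token into k consecutive mask slots.

    Returns the expanded sequence and, for each original [MASK], the indices
    of its expanded slots, so their hidden states can later be aggregated
    back into a single prediction.
    """
    expanded: List[str] = []
    groups: List[List[int]] = []
    for tok in tokens:
        if tok == MASK:
            start = len(expanded)
            expanded.extend([MASK] * k)
            groups.append(list(range(start, start + k)))
        else:
            expanded.append(tok)
    return expanded, groups

# One corrupted position becomes three context slots the encoder can attend over.
print(expand_masks(["the", "[MASK]", "sat", "on", "the", "mat"], k=3))
```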
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span> [<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. 
3. arXiv:2501.10694 [pdf, ps, other] (cs.IT, eess.SP)
   Energy Efficiency Maximization for Movable Antenna-Enhanced System Based on Statistical CSI
   Authors: Xintai Chen, Biqian Feng, Yongpeng Wu, Wenjun Zhang
   Abstract: This paper investigates an innovative movable antenna (MA)-enhanced multiple-input multiple-output (MIMO) system designed to enhance communication performance. We aim to maximize the energy efficiency (EE) under statistical channel state information (S-CSI) through a joint optimization of the transmit covariance matrix and the antenna position vectors (APVs). To solve the stochastic problem, we consider the scenario with a large number of antennas and resort to deterministic equivalent (DE) technology to reformulate the system EE with respect to the transmit variables (the transmit covariance matrix and APV) and the receive variables (the receive APV). We then propose an alternating optimization (AO) algorithm that updates the transmit variables and the receive variables in turn to maximize the system EE. Our numerical results reveal that the proposed MA-enhanced system can significantly improve EE compared to several benchmark schemes, and that the optimal performance can be achieved with a finite size of movement regions for MAs.
   Submitted 18 January, 2025; originally announced January 2025.
   Comments: Accepted by ICC, 6 pages, 2 figures
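The alternating update described in the abstract above follows the usual block-coordinate pattern: improve the transmit variables with the receive variables fixed, then swap. The following is only a generic alternating-maximization skeleton over a toy objective; the paper's deterministic-equivalent EE expression, its constraints, and its actual update rules are not reproduced here.

```python
import numpy as np

def toy_objective(tx, rx):
    # Stand-in for the DE-based energy-efficiency expression (concave per block).
    return -np.sum((tx - 1.0) ** 2) - np.sum((rx + 0.5) ** 2) + 0.1 * tx @ rx

def alternating_maximize(dim=4, iters=100, step=0.1):
    tx = np.zeros(dim)  # placeholder for the transmit covariance + transmit APV
    rx = np.zeros(dim)  # placeholder for the receive APV
    for _ in range(iters):
        tx += step * (-2.0 * (tx - 1.0) + 0.1 * rx)  # ascent step with rx fixed
        rx += step * (-2.0 * (rx + 0.5) + 0.1 * tx)  # ascent step with tx fixed
    return tx, rx, toy_objective(tx, rx)

print(alternating_maximize())
```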
4. arXiv:2501.06414 [pdf, other] (eess.SP, cs.LG)
   IPP-Net: A Generalizable Deep Neural Network Model for Indoor Pathloss Radio Map Prediction
   Authors: Bin Feng, Meng Zheng, Wei Liang, Lei Zhang
   Abstract: In this paper, we propose a generalizable deep neural network model for indoor pathloss radio map prediction, termed IPP-Net. IPP-Net is based on a UNet architecture and is learned from both large-scale ray tracing simulation data and a modified 3GPP indoor hotspot model. The performance of IPP-Net is evaluated in the First Indoor Pathloss Radio Map Prediction Challenge at ICASSP 2025. The evaluation results show that IPP-Net achieves a weighted root mean square error of 9.501 dB on the three competition tasks and obtains the second overall ranking.
   Submitted 10 January, 2025; originally announced January 2025.
   Comments: 2 pages, 1 figure, Accepted to ICASSP 2025
5. arXiv:2501.02738 [pdf, other] (cs.IT)
   SCSC: A Novel Standards-Compatible Semantic Communication Framework for Image Transmission
   Authors: Xue Han, Yongpeng Wu, Zhen Gao, Biqian Feng, Yuxuan Shi, Deniz Gündüz, Wenjun Zhang
   Abstract: Joint source-channel coding (JSCC) is a promising paradigm for next-generation communication systems, particularly in challenging transmission environments. In this paper, we propose a novel standard-compatible JSCC framework for the transmission of images over multiple-input multiple-output (MIMO) channels. Different from the existing end-to-end AI-based DeepJSCC schemes, our framework consists of learnable modules that enable communication using conventional separate source and channel codes (SSCC), which makes it amenable for easy deployment on legacy systems. Specifically, the learnable modules involve a preprocessing-empowered network (PPEN) for preserving essential semantic information, and a precoder & combiner-enhanced network (PCEN) for efficient transmission over a resource-constrained MIMO channel. We treat existing compression and channel coding modules as non-trainable blocks. Since the parameters of these modules are non-differentiable, we employ a proxy network that mimics their operations when training the learnable modules. Numerical results demonstrate that our scheme can save more than 29% of the channel bandwidth, and requires lower complexity compared to the constrained baselines. We also show its generalization capability to unseen datasets and tasks through extensive experiments.
   Submitted 5 January, 2025; originally announced January 2025.
   Comments: Accepted by IEEE Transactions on Communications
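One detail worth illustrating from the SCSC abstract is training learnable modules around frozen, non-differentiable codec blocks by backpropagating through a proxy network that mimics them. A common way to wire this up is a straight-through substitution, sketched below; whether SCSC uses exactly this form is not stated in the abstract, and the codec and proxy here are toy placeholders.

```python
import torch
import torch.nn as nn

class ProxyStraightThrough(nn.Module):
    """Forward pass uses a non-differentiable codec; gradients flow through a
    differentiable proxy trained to mimic it. `codec` and `proxy` are placeholders."""

    def __init__(self, codec, proxy: nn.Module):
        super().__init__()
        self.codec = codec
        self.proxy = proxy

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            hard = self.codec(x)        # exact but non-differentiable output
        soft = self.proxy(x)            # differentiable imitation
        # value of `hard`, gradient of `soft`
        return hard + soft - soft.detach()

# Toy usage: 8-bit quantization as the "codec", a small MLP as the proxy.
codec = lambda x: torch.round(x * 255.0) / 255.0
proxy = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 16))
layer = ProxyStraightThrough(codec, proxy)
x = torch.randn(4, 16, requires_grad=True)
layer(x).sum().backward()               # gradients reach x through the proxy
print(x.grad.shape)
```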
6. arXiv:2501.01912 [pdf, other] (astro-ph.EP, astro-ph.IM, cs.CV, eess.IV)
   Exoplanet Detection via Differentiable Rendering
   Authors: Brandon Y. Feng, Rodrigo Ferrer-Chávez, Aviad Levis, Jason J. Wang, Katherine L. Bouman, William T. Freeman
   Abstract: Direct imaging of exoplanets is crucial for advancing our understanding of planetary systems beyond our solar system, but it faces significant challenges due to the high contrast between host stars and their planets. Wavefront aberrations introduce speckles in the telescope science images, which are patterns of diffracted starlight that can mimic the appearance of planets, complicating the detection of faint exoplanet signals. Traditional post-processing methods, operating primarily in the image intensity domain, do not integrate wavefront sensing data. These data, measured mainly for adaptive optics corrections, have been overlooked as a potential resource for post-processing, partly due to the challenge of the evolving nature of wavefront aberrations. In this paper, we present a differentiable rendering approach that leverages these wavefront sensing data to improve exoplanet detection. Our differentiable renderer models wave-based light propagation through a coronagraphic telescope system, allowing gradient-based optimization to significantly improve starlight subtraction and increase sensitivity to faint exoplanets. Simulation experiments based on the James Webb Space Telescope configuration demonstrate the effectiveness of our approach, achieving substantial improvements in contrast and planet detection limits. Our results showcase how the computational advancements enabled by differentiable rendering can revitalize previously underexploited wavefront data, opening new avenues for enhancing exoplanet imaging and characterization.
   Submitted 3 January, 2025; originally announced January 2025.
   Comments: Webpage: https://brandonyfeng.github.io/EDDO/
7. arXiv:2412.19437 [pdf, other] (cs.CL, cs.AI)
   DeepSeek-V3 Technical Report
   Authors: DeepSeek-AI, Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, Damai Dai, Daya Guo, Dejian Yang, Deli Chen, Dongjie Ji, Erhang Li, Fangyun Lin, Fucong Dai, Fuli Luo, Guangbo Hao, Guanting Chen, Guowei Li, H. Zhang, Han Bao, et al. (175 additional authors not shown)
   Abstract: We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters, of which 37B are activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.
   Submitted 18 February, 2025; v1 submitted 26 December, 2024; originally announced December 2024.
8. arXiv:2412.07761 [pdf, other] (cs.CV)
   Repurposing Pre-trained Video Diffusion Models for Event-based Video Interpolation
   Authors: Jingxi Chen, Brandon Y. Feng, Haoming Cai, Tianfu Wang, Levi Burner, Dehao Yuan, Cornelia Fermuller, Christopher A. Metzler, Yiannis Aloimonos
   Abstract: Video Frame Interpolation aims to recover realistic missing frames between observed frames, generating a high-frame-rate video from a low-frame-rate video. However, without additional guidance, the large motion between frames makes this problem ill-posed. Event-based Video Frame Interpolation (EVFI) addresses this challenge by using sparse, high-temporal-resolution event measurements as motion guidance. This guidance allows EVFI methods to significantly outperform frame-only methods. However, to date, EVFI methods have relied on a limited set of paired event-frame training data, severely limiting their performance and generalization capabilities. In this work, we overcome the limited data challenge by adapting pre-trained video diffusion models trained on internet-scale datasets to EVFI. We experimentally validate our approach on real-world EVFI datasets, including a new one that we introduce. Our method outperforms existing methods and generalizes across cameras far better than existing approaches.
   Submitted 10 December, 2024; originally announced December 2024.
9. arXiv:2412.05569 [pdf, other] (cs.LG, q-bio.BM)
   SMI-Editor: Edit-based SMILES Language Model with Fragment-level Supervision
   Authors: Kangjie Zheng, Siyue Liang, Junwei Yang, Bin Feng, Zequn Liu, Wei Ju, Zhiping Xiao, Ming Zhang
   Abstract: SMILES, a crucial textual representation of molecular structures, has garnered significant attention as a foundation for pre-trained language models (LMs). However, most existing pre-trained SMILES LMs focus solely on single-token-level supervision during pre-training, failing to fully leverage the substructural information of molecules. This limitation makes the pre-training task overly simplistic, preventing the models from capturing richer molecular semantic information. Moreover, during pre-training, these SMILES LMs only process corrupted SMILES inputs, never encountering any valid SMILES, which leads to a train-inference mismatch. To address these challenges, we propose SMI-Editor, a novel edit-based pre-trained SMILES LM. SMI-Editor disrupts substructures within a molecule at random and feeds the resulting SMILES back into the model, which then attempts to restore the original SMILES through an editing process. This approach not only introduces fragment-level training signals, but also enables the use of valid SMILES as inputs, allowing the model to learn how to reconstruct complete molecules from these incomplete structures. As a result, the model demonstrates improved scalability and an enhanced ability to capture fragment-level molecular information. Experimental results show that SMI-Editor achieves state-of-the-art performance across multiple downstream molecular tasks, even outperforming several 3D molecular representation models.
   Submitted 7 December, 2024; originally announced December 2024.
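The training signal described in the SMI-Editor abstract comes from corrupting a molecule at the fragment level and asking the model to edit the string back to the original. The snippet below is a character-level stand-in for that corruption step, kept deliberately naive (real fragment disruption operates on chemically meaningful substructures, e.g. via a cheminformatics toolkit); it only shows how (corrupted input, original target) pairs could be produced.

```python
import random
from typing import Tuple

def corrupt_smiles(smiles: str, max_frac: float = 0.3,
                   rng: random.Random = random.Random(0)) -> Tuple[str, str]:
    """Drop one random contiguous span from a SMILES string and return the
    (corrupted, original) pair an edit-based model would be trained to undo."""
    n = len(smiles)
    span = max(1, int(n * max_frac * rng.random()))
    start = rng.randrange(0, n - span + 1)
    return smiles[:start] + smiles[start + span:], smiles

print(corrupt_smiles("CC(=O)Oc1ccccc1C(=O)O"))  # aspirin with one span removed
```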
10. arXiv:2412.05496 [pdf, other] (cs.LG, cs.PF, cs.PL)
    Flex Attention: A Programming Model for Generating Optimized Attention Kernels
    Authors: Juechu Dong, Boyuan Feng, Driss Guessous, Yanbo Liang, Horace He
    Abstract: Over the past 7 years, attention has become one of the most important primitives in deep learning. The primary approach to optimize attention is FlashAttention, which fuses the operation together, drastically improving both the runtime and the memory consumption. However, the importance of FlashAttention combined with its monolithic nature poses a problem for researchers aiming to try new attention variants -- a "software lottery". This problem is exacerbated by the difficulty of writing efficient fused attention kernels, resisting traditional compiler-based approaches. We introduce FlexAttention, a novel compiler-driven programming model that allows implementing the majority of attention variants in a few lines of idiomatic PyTorch code. We demonstrate that many existing attention variants (e.g. Alibi, Document Masking, PagedAttention, etc.) can be implemented via FlexAttention, and that we achieve competitive performance compared to these handwritten kernels. Finally, we demonstrate how FlexAttention allows for easy composition of attention variants, solving the combinatorial explosion of attention variants.
    Submitted 6 December, 2024; originally announced December 2024.
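Since the FlexAttention abstract claims that attention variants reduce to a few lines of PyTorch, here is what such a variant typically looks like with the flex_attention API that ships in recent PyTorch releases (roughly 2.5+): a score_mod callback applied to each attention score. The module path and callback signature below match the publicly documented API at the time of writing but may shift between versions, and the ALiBi slopes are illustrative values only.

```python
import torch
from torch.nn.attention.flex_attention import flex_attention

B, H, S, D = 2, 8, 128, 64
q, k, v = (torch.randn(B, H, S, D) for _ in range(3))

# Per-head ALiBi slopes (illustrative, not tuned).
slopes = torch.tensor([2.0 ** (-i) for i in range(1, H + 1)])

def alibi_causal(score, b, h, q_idx, kv_idx):
    # Penalize attention to distant past tokens and mask out future tokens.
    bias = -slopes[h] * (q_idx - kv_idx)
    return torch.where(q_idx >= kv_idx, score + bias, float("-inf"))

out = flex_attention(q, k, v, score_mod=alibi_causal)
print(out.shape)  # torch.Size([2, 8, 128, 64])
```

In practice the call is wrapped in torch.compile so the callback is fused into a single attention kernel; eager execution works but is slow.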
11. arXiv:2411.07742 [pdf, other] (cs.CV)
    Efficient 3D Perception on Multi-Sweep Point Cloud with Gumbel Spatial Pruning
    Authors: Tianyu Sun, Jianhao Li, Xueqian Zhang, Zhongdao Wang, Bailan Feng, Hengshuang Zhao
    Abstract: This paper studies point cloud perception within outdoor environments. Existing methods face limitations in recognizing objects located at a distance or occluded, due to the sparse nature of outdoor point clouds. In this work, we observe a significant mitigation of this problem by accumulating multiple temporally consecutive point cloud sweeps, resulting in a remarkable improvement in perception accuracy. However, the computation cost also increases, hindering previous approaches from utilizing a large number of point cloud sweeps. To tackle this challenge, we find that a considerable portion of points in the accumulated point cloud is redundant, and discarding these points has minimal impact on perception accuracy. We introduce a simple yet effective Gumbel Spatial Pruning (GSP) layer that dynamically prunes points based on a learned end-to-end sampling. The GSP layer is decoupled from other network components and thus can be seamlessly integrated into existing point cloud network architectures. Without incurring additional computational overhead, we increase the number of point cloud sweeps from 10, a common practice, to as many as 40. Consequently, there is a significant enhancement in perception performance. For instance, in nuScenes 3D object detection and BEV map segmentation tasks, our pruning strategy improves several 3D perception baseline methods.
    Submitted 20 February, 2025; v1 submitted 12 November, 2024; originally announced November 2024.
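The pruning mechanism named in the Gumbel Spatial Pruning abstract can be pictured as a per-point keep/drop decision sampled with the Gumbel-softmax trick so that the sampling remains trainable end to end. The layer below is a generic stand-in for that idea; the scoring network, temperature, and how kept points are gathered downstream are assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GumbelPointPruning(nn.Module):
    """Score each point, then sample a hard keep/drop decision via Gumbel-softmax."""

    def __init__(self, feat_dim: int, tau: float = 1.0):
        super().__init__()
        self.scorer = nn.Sequential(nn.Linear(feat_dim, 64), nn.ReLU(),
                                    nn.Linear(64, 2))  # logits: [drop, keep]
        self.tau = tau

    def forward(self, feats: torch.Tensor):
        # feats: (N, feat_dim) per-point features
        logits = self.scorer(feats)                                  # (N, 2)
        onehot = F.gumbel_softmax(logits, tau=self.tau, hard=True)   # hard 0/1, soft gradients
        keep = onehot[:, 1]                                          # (N,)
        return feats * keep.unsqueeze(-1), keep

pruner = GumbelPointPruning(feat_dim=32)
pts = torch.randn(1000, 32)
pruned, keep = pruner(pts)
print(int(keep.sum()), "of", pts.shape[0], "points kept")
```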
However, the computation cost also increases, hindering previous approaches from utilizing a large number of point cloud sweeps. To tackle this challenge, we find that a considerable portion of points in the accumulated point cloud is redundant, and discarding these points has minimal impact on perception accuracy. We introduce a simple yet effective Gumbel Spatial Pruning (GSP) layer that dynamically prunes points based on a learned end-to-end sampling. The GSP layer is decoupled from other network components and thus can be seamlessly integrated into existing point cloud network architectures. Without incurring additional computational overhead, we increase the number of point cloud sweeps from 10, a common practice, to as many as 40. Consequently, there is a significant enhancement in perception performance. For instance, in nuScenes 3D object detection and BEV map segmentation tasks, our pruning strategy improves several 3D perception baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07742v3-abstract-full').style.display = 'none'; document.getElementById('2411.07742v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06324">arXiv:2411.06324</a> <span> [<a href="https://arxiv.org/pdf/2411.06324">pdf</a>, <a href="https://arxiv.org/format/2411.06324">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Amortized Bayesian Local Interpolation NetworK: Fast covariance parameter estimation for Gaussian Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+B+R">Brandon R. Feng</a>, <a href="/search/cs?searchtype=author&query=Majumder%2C+R">Reetam Majumder</a>, <a href="/search/cs?searchtype=author&query=Reich%2C+B+J">Brian J. Reich</a>, <a href="/search/cs?searchtype=author&query=Abba%2C+M+A">Mohamed A. Abba</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06324v1-abstract-short" style="display: inline;"> Gaussian processes (GPs) are a ubiquitous tool for geostatistical modeling with high levels of flexibility and interpretability, and the ability to make predictions at unseen spatial locations through a process called Kriging. Estimation of Kriging weights relies on the inversion of the process' covariance matrix, creating a computational bottleneck for large spatial datasets. 
In this paper, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06324v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06324v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06324v1-abstract-full" style="display: none;"> Gaussian processes (GPs) are a ubiquitous tool for geostatistical modeling with high levels of flexibility and interpretability, and the ability to make predictions at unseen spatial locations through a process called Kriging. Estimation of Kriging weights relies on the inversion of the process' covariance matrix, creating a computational bottleneck for large spatial datasets. In this paper, we propose an Amortized Bayesian Local Interpolation NetworK (A-BLINK) for fast covariance parameter estimation, which uses two pre-trained deep neural networks to learn a mapping from spatial location coordinates and covariance function parameters to Kriging weights and the spatial variance, respectively. The fast prediction time of these networks allows us to bypass the matrix inversion step, creating large computational speedups over competing methods in both frequentist and Bayesian settings, and also provides full posterior inference and predictions using Markov chain Monte Carlo sampling methods. We show significant increases in computational efficiency over comparable scalable GP methodology in an extensive simulation study with lower parameter estimation error. The efficacy of our approach is also demonstrated using a temperature dataset of US climate normals for 1991--2020 based on over 7,000 weather stations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06324v1-abstract-full').style.display = 'none'; document.getElementById('2411.06324v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
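The A-BLINK abstract above describes amortizing Kriging: pre-trained networks map spatial coordinates and covariance parameters directly to Kriging weights, avoiding the covariance-matrix inversion at prediction time. The snippet below is a minimal, hedged sketch of that amortization idea only; the network shape, the input features (neighbor offsets plus range/smoothness/nugget), and all names are illustrative assumptions, not the authors' A-BLINK implementation.

```python
# Minimal sketch of amortized Kriging-weight prediction (assumed architecture,
# not the A-BLINK code): a small MLP replaces the covariance-matrix solve.
import torch
import torch.nn as nn

class KrigingWeightNet(nn.Module):
    """Maps local neighbor offsets + covariance parameters to Kriging weights."""
    def __init__(self, n_neighbors: int, hidden: int = 64):
        super().__init__()
        # input: flattened (dx, dy) offsets of neighbors plus (range, smoothness, nugget)
        self.net = nn.Sequential(
            nn.Linear(2 * n_neighbors + 3, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, n_neighbors),  # one weight per neighbor
        )

    def forward(self, neighbor_offsets, cov_params):
        x = torch.cat([neighbor_offsets.flatten(1), cov_params], dim=-1)
        return self.net(x)

# Prediction at a new site is a weighted sum of neighboring observations,
# with no matrix inversion at inference time.
net = KrigingWeightNet(n_neighbors=10)
offsets = torch.randn(4, 10, 2)           # batch of 4 sites, 10 neighbors each
params = torch.rand(4, 3)                 # (range, smoothness, nugget) per site
weights = net(offsets, params)            # -> (4, 10)
y_neighbors = torch.randn(4, 10)          # observed values at the neighbors
y_pred = (weights * y_neighbors).sum(-1)  # amortized Kriging prediction
```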
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17922">arXiv:2410.17922</a> <span> [<a href="https://arxiv.org/pdf/2410.17922">pdf</a>, <a href="https://arxiv.org/format/2410.17922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Guided and Domain Applicable Safeguards for Enhanced Security in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weidi Luo</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+H">He Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+A">Aidan Wong</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bing Feng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17922v2-abstract-short" style="display: inline;"> With the extensive deployment of Large Language Models (LLMs), ensuring their safety has become increasingly critical. However, existing defense methods often struggle with two key issues: (i) inadequate defense capabilities, particularly in domain-specific scenarios like chemistry, where a lack of specialized knowledge can lead to the generation of harmful responses to malicious queries. (ii) ove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17922v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17922v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17922v2-abstract-full" style="display: none;"> With the extensive deployment of Large Language Models (LLMs), ensuring their safety has become increasingly critical. However, existing defense methods often struggle with two key issues: (i) inadequate defense capabilities, particularly in domain-specific scenarios like chemistry, where a lack of specialized knowledge can lead to the generation of harmful responses to malicious queries. (ii) over-defensiveness, which compromises the general utility and responsiveness of LLMs. To mitigate these issues, we introduce a multi-agents-based defense framework, Guide for Defense (G4D), which leverages accurate external information to provide an unbiased summary of user intentions and analytically grounded safety response guidance. Extensive experiments on popular jailbreak attacks and benign datasets show that our G4D can enhance LLM's robustness against jailbreak attacks on general and domain-specific scenarios without compromising the model's general functionality. 
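The G4D entry above describes a retrieval-guided, multi-agent defense pipeline: gather external domain knowledge, summarize the user's intention, derive safety guidance, and only then answer. The schematic below captures that control flow in plain Python; `retrieve` and `llm` are hypothetical callables and the prompts are illustrative, so this is a shape sketch rather than the G4D implementation.

```python
# Schematic sketch (assumed interfaces, not the G4D code): retrieval-guided
# intention summarization followed by a safety-guidance gate before answering.
def guided_defense(query: str, retrieve, llm) -> str:
    facts = retrieve(query)  # external, domain-specific knowledge (e.g., chemistry)
    intent = llm(
        "Summarize the user's intention neutrally.\n"
        f"Retrieved facts: {facts}\nQuery: {query}"
    )
    guidance = llm(
        f"User intent: {intent}\n"
        "Reply 'SAFE: <guidance>' or 'UNSAFE: <reason>' for answering this request."
    )
    if guidance.strip().upper().startswith("UNSAFE"):
        return "Sorry, I can't help with that request."
    return llm(f"Answer the query, following this guidance: {guidance}\nQuery: {query}")
```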
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17922v2-abstract-full').style.display = 'none'; document.getElementById('2410.17922v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05787">arXiv:2410.05787</a> <span> [<a href="https://arxiv.org/pdf/2410.05787">pdf</a>, <a href="https://arxiv.org/format/2410.05787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> An accelerate Prediction Strategy for Dynamic Multi-Objective Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+R">Ru Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lin Li</a>, <a href="/search/cs?searchtype=author&query=Stolkin%2C+R">Rustam Stolkin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05787v2-abstract-short" style="display: inline;"> This paper addresses the challenge of dynamic multi-objective optimization problems (DMOPs) by introducing novel approaches for accelerating prediction strategies within the evolutionary algorithm framework. Since the objectives of DMOPs evolve over time, both the Pareto optimal set (PS) and the Pareto optimal front (PF) are dynamic. To effectively track the changes in the PS and PF in both decisi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05787v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05787v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05787v2-abstract-full" style="display: none;"> This paper addresses the challenge of dynamic multi-objective optimization problems (DMOPs) by introducing novel approaches for accelerating prediction strategies within the evolutionary algorithm framework. Since the objectives of DMOPs evolve over time, both the Pareto optimal set (PS) and the Pareto optimal front (PF) are dynamic. To effectively track the changes in the PS and PF in both decision and objective spaces, we propose an adaptive prediction strategy that incorporates second-order derivatives to predict and adjust the algorithms search behavior. This strategy enhances the algorithm's ability to anticipate changes in the environment, allowing for more efficient population re-initialization. We evaluate the performance of the proposed method against four state-of-the-art algorithms using standard DMOPs benchmark problems. Experimental results demonstrate that the proposed approach significantly outperforms the other algorithms across most test problems. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05787v2-abstract-full').style.display = 'none'; document.getElementById('2410.05787v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Dynamic Multi-objective Optimization Problems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02764">arXiv:2410.02764</a> <span> [<a href="https://arxiv.org/pdf/2410.02764">pdf</a>, <a href="https://arxiv.org/format/2410.02764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Flash-Splat: 3D Reflection Removal with Flash Cues and Gaussian Splats </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+M">Mingyang Xie</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Haoming Cai</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+S">Sachin Shah</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiran Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jia-Bin Huang</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+C+A">Christopher A. Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02764v1-abstract-short" style="display: inline;"> We introduce a simple yet effective approach for separating transmitted and reflected light. Our key insight is that the powerful novel view synthesis capabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian splatting) allow one to perform flash/no-flash reflection separation using unpaired measurements -- this relaxation dramatically simplifies image acquisition over conventio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02764v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02764v1-abstract-full" style="display: none;"> We introduce a simple yet effective approach for separating transmitted and reflected light. 
Our key insight is that the powerful novel view synthesis capabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian splatting) allow one to perform flash/no-flash reflection separation using unpaired measurements -- this relaxation dramatically simplifies image acquisition over conventional paired flash/no-flash reflection separation methods. Through extensive real-world experiments, we demonstrate our method, Flash-Splat, accurately reconstructs both transmitted and reflected scenes in 3D. Our method outperforms existing 3D reflection separation methods, which do not leverage illumination control, by a large margin. Our project webpage is at https://flash-splat.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02764v1-abstract-full').style.display = 'none'; document.getElementById('2410.02764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18026">arXiv:2409.18026</a> <span> [<a href="https://arxiv.org/pdf/2409.18026">pdf</a>, <a href="https://arxiv.org/format/2409.18026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ReliOcc: Towards Reliable Semantic Occupancy Prediction via Uncertainty Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongdao Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiawei Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wentong Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bailan Feng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junbo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18026v1-abstract-short" style="display: inline;"> Vision-centric semantic occupancy prediction plays a crucial role in autonomous driving, which requires accurate and reliable predictions from low-cost sensors. Although having notably narrowed the accuracy gap with LiDAR, there is still few research effort to explore the reliability in predicting semantic occupancy from camera. In this paper, we conduct a comprehensive evaluation of existing sema… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18026v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18026v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18026v1-abstract-full" style="display: none;"> Vision-centric semantic occupancy prediction plays a crucial role in autonomous driving, which requires accurate and reliable predictions from low-cost sensors. 
Although camera-based methods have notably narrowed the accuracy gap with LiDAR, there has been little research effort to explore the reliability of predicting semantic occupancy from cameras. In this paper, we conduct a comprehensive evaluation of existing semantic occupancy prediction models from a reliability perspective for the first time. Despite the gradual alignment of camera-based models with LiDAR in terms of accuracy, a significant reliability gap persists. To address this concern, we propose ReliOcc, a method designed to enhance the reliability of camera-based occupancy networks. ReliOcc provides a plug-and-play scheme for existing models, which integrates hybrid uncertainty from individual voxels with sampling-based noise and relative voxels through mix-up learning. In addition, an uncertainty-aware calibration strategy is devised to further enhance model reliability in offline mode. Extensive experiments under various settings demonstrate that ReliOcc significantly enhances model reliability while maintaining the accuracy of both geometric and semantic predictions. Importantly, our proposed approach exhibits robustness to sensor failures and out-of-domain noise during inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18026v1-abstract-full').style.display = 'none'; document.getElementById('2409.18026v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report. Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14072">arXiv:2409.14072</a> <span> [<a href="https://arxiv.org/pdf/2409.14072">pdf</a>, <a href="https://arxiv.org/format/2409.14072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic 2D Gaussians: Geometrically accurate radiance fields for dynamic objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guanjun Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinggang Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14072v1-abstract-short" style="display: inline;"> Reconstructing objects and extracting high-quality surfaces play a vital role in the real world. Current 4D representations show the ability to render high-quality novel views for dynamic objects but cannot reconstruct high-quality meshes due to their implicit or geometrically inaccurate representations.
In this paper, we propose a novel representation that can reconstruct accurate meshes from spa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14072v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14072v1-abstract-full" style="display: none;"> Reconstructing objects and extracting high-quality surfaces play a vital role in the real world. Current 4D representations show the ability to render high-quality novel views for dynamic objects but cannot reconstruct high-quality meshes due to their implicit or geometrically inaccurate representations. In this paper, we propose a novel representation that can reconstruct accurate meshes from sparse image input, named Dynamic 2D Gaussians (D-2DGS). We adopt 2D Gaussians for basic geometry representation and use sparse-controlled points to capture 2D Gaussian's deformation. By extracting the object mask from the rendered high-quality image and masking the rendered depth map, a high-quality dynamic mesh sequence of the object can be extracted. Experiments demonstrate that our D-2DGS is outstanding in reconstructing high-quality meshes from sparse input. More demos and code are available at https://github.com/hustvl/Dynamic-2DGS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14072v1-abstract-full').style.display = 'none'; document.getElementById('2409.14072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07762">arXiv:2409.07762</a> <span> [<a href="https://arxiv.org/pdf/2409.07762">pdf</a>, <a href="https://arxiv.org/ps/2409.07762">ps</a>, <a href="https://arxiv.org/format/2409.07762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring Kolmogorov-Arnold networks for realistic image sharpness assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shaode Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ze Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhimu Yang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiacheng Gu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bizu Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07762v3-abstract-short" style="display: inline;"> Score prediction is crucial in evaluating realistic image sharpness based on collected informative features. Recently, Kolmogorov-Arnold networks (KANs) have been developed and witnessed remarkable success in data fitting. This study introduces the Taylor series-based KAN (TaylorKAN). 
Then, different KANs are explored in four realistic image databases (BID2011, CID2013, CLIVE, and KonIQ-10k) to pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07762v3-abstract-full').style.display = 'inline'; document.getElementById('2409.07762v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07762v3-abstract-full" style="display: none;"> Score prediction is crucial in evaluating realistic image sharpness based on collected informative features. Recently, Kolmogorov-Arnold networks (KANs) have been developed and witnessed remarkable success in data fitting. This study introduces the Taylor series-based KAN (TaylorKAN). Then, different KANs are explored in four realistic image databases (BID2011, CID2013, CLIVE, and KonIQ-10k) to predict the scores by using 15 mid-level features and 2048 high-level features. Compared to support vector regression, results show that KANs are generally competitive or superior, and TaylorKAN is the best one when mid-level features are used. This is the first study to investigate KANs on image quality assessment that sheds some light on how to select and further improve KANs in related tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07762v3-abstract-full').style.display = 'none'; document.getElementById('2409.07762v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12519">arXiv:2407.12519</a> <span> [<a href="https://arxiv.org/pdf/2407.12519">pdf</a>, <a href="https://arxiv.org/format/2407.12519">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Causality-inspired Discriminative Feature Learning in Triple Domains for Gait Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Haijun Xiong</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinggang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12519v1-abstract-short" style="display: inline;"> Gait recognition is a biometric technology that distinguishes individuals by their walking patterns. However, previous methods face challenges when accurately extracting identity features because they often become entangled with non-identity clues. 
To address this challenge, we propose CLTD, a causality-inspired discriminative feature learning module designed to effectively eliminate the influence… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12519v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12519v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12519v1-abstract-full" style="display: none;"> Gait recognition is a biometric technology that distinguishes individuals by their walking patterns. However, previous methods face challenges when accurately extracting identity features because they often become entangled with non-identity clues. To address this challenge, we propose CLTD, a causality-inspired discriminative feature learning module designed to effectively eliminate the influence of confounders in triple domains, i.e., spatial, temporal, and spectral. Specifically, we utilize the Cross Pixel-wise Attention Generator (CPAG) to generate attention distributions for factual and counterfactual features in spatial and temporal domains. Then, we introduce the Fourier Projection Head (FPH) to project spatial features into the spectral space, which preserves essential information while reducing computational costs. Additionally, we employ an optimization method with contrastive learning to enforce semantic consistency constraints across sequences from the same subject. Our approach has demonstrated significant performance improvements on challenging datasets, proving its effectiveness. Moreover, it can be seamlessly integrated into existing gait recognition methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12519v1-abstract-full').style.display = 'none'; document.getElementById('2407.12519v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
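The CLTD abstract mentions a Fourier Projection Head that maps spatial features into the spectral domain to keep essential information at lower cost. The sketch below shows one generic way such a spectral projection could look in PyTorch; pooling the spectral magnitudes per channel and the layer sizes are assumptions, and this is not the authors' FPH.

```python
# Rough sketch of a Fourier-style projection head (assumed design, not CLTD's FPH):
# FFT over the spatial dimensions, per-channel spectral energy, then a linear map.
import torch
import torch.nn as nn

class FourierProjectionHead(nn.Module):
    def __init__(self, channels: int, out_dim: int):
        super().__init__()
        self.proj = nn.Linear(channels, out_dim)

    def forward(self, feats):                    # feats: (B, C, H, W) spatial features
        spec = torch.fft.rfft2(feats)            # complex spectrum over H, W
        energy = spec.abs().mean(dim=(-2, -1))   # (B, C) per-channel spectral energy
        return self.proj(energy)                 # compact spectral embedding

head = FourierProjectionHead(channels=256, out_dim=128)
embedding = head(torch.randn(8, 256, 16, 11))    # -> (8, 128)
```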
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12294">arXiv:2407.12294</a> <span> [<a href="https://arxiv.org/pdf/2407.12294">pdf</a>, <a href="https://arxiv.org/format/2407.12294">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VEON: Vocabulary-Enhanced Occupancy Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jilai Zheng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+P">Pin Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongdao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiangxuan Ren</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bailan Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chao Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12294v1-abstract-short" style="display: inline;"> Perceiving the world as 3D occupancy supports embodied agents to avoid collision with any types of obstacle. While open-vocabulary image understanding has prospered recently, how to bind the predicted 3D occupancy grids with open-world semantics still remains under-explored due to limited open-world annotations. Hence, instead of building our model from scratch, we try to blend 2D foundation model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12294v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12294v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12294v1-abstract-full" style="display: none;"> Perceiving the world as 3D occupancy supports embodied agents to avoid collision with any types of obstacle. While open-vocabulary image understanding has prospered recently, how to bind the predicted 3D occupancy grids with open-world semantics still remains under-explored due to limited open-world annotations. Hence, instead of building our model from scratch, we try to blend 2D foundation models, specifically a depth model MiDaS and a semantic model CLIP, to lift the semantics to 3D space, thus fulfilling 3D occupancy. However, building upon these foundation models is not trivial. First, the MiDaS faces the depth ambiguity problem, i.e., it only produces relative depth but fails to estimate bin depth for feature lifting. Second, the CLIP image features lack high-resolution pixel-level information, which limits the 3D occupancy accuracy. Third, open vocabulary is often trapped by the long-tail problem. To address these issues, we propose VEON for Vocabulary-Enhanced Occupancy predictioN by not only assembling but also adapting these foundation models. We first equip MiDaS with a Zoedepth head and low-rank adaptation (LoRA) for relative-metric-bin depth transformation while reserving beneficial depth prior. 
Then, a lightweight side adaptor network is attached to the CLIP vision encoder to generate high-resolution features for fine-grained 3D occupancy prediction. Moreover, we design a class reweighting strategy to give priority to the tail classes. With only 46M trainable parameters and zero manual semantic labels, VEON achieves 15.14 mIoU on Occ3D-nuScenes, and shows the capability of recognizing objects with open-vocabulary categories, meaning that our VEON is label-efficient, parameter-efficient, and precise enough. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12294v1-abstract-full').style.display = 'none'; document.getElementById('2407.12294v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11382">arXiv:2407.11382</a> <span> [<a href="https://arxiv.org/pdf/2407.11382">pdf</a>, <a href="https://arxiv.org/format/2407.11382">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Segment, Lift and Fit: Automatic 3D Shape Labeling from 2D Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianhao Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+T">Tianyu Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongdao Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+E">Enze Xie</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bailan Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Ze Yuan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Ke Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11382v2-abstract-short" style="display: inline;"> This paper proposes an algorithm for automatically labeling 3D objects from 2D point or box prompts, especially focusing on applications in autonomous driving. Unlike previous arts, our auto-labeler predicts 3D shapes instead of bounding boxes and does not require training on a specific dataset. We propose a Segment, Lift, and Fit (SLF) paradigm to achieve this goal. 
Firstly, we segment high-quali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11382v2-abstract-full').style.display = 'inline'; document.getElementById('2407.11382v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11382v2-abstract-full" style="display: none;"> This paper proposes an algorithm for automatically labeling 3D objects from 2D point or box prompts, especially focusing on applications in autonomous driving. Unlike prior art, our auto-labeler predicts 3D shapes instead of bounding boxes and does not require training on a specific dataset. We propose a Segment, Lift, and Fit (SLF) paradigm to achieve this goal. Firstly, we segment high-quality instance masks from the prompts using the Segment Anything Model (SAM) and transform the remaining problem into predicting 3D shapes from given 2D masks. Due to the ill-posed nature of this problem, it presents a significant challenge as multiple 3D shapes can project into an identical mask. To tackle this issue, we then lift 2D masks to 3D forms and employ gradient descent to adjust their poses and shapes until the projections fit the masks and the surfaces conform to surrounding LiDAR points. Notably, since we do not train on a specific dataset, the SLF auto-labeler does not overfit to biased annotation patterns in the training set as other methods do. Thus, the generalization ability across different datasets improves. Experimental results on the KITTI dataset demonstrate that the SLF auto-labeler produces high-quality bounding box annotations, achieving an AP@0.5 IoU of nearly 90%. Detectors trained with the generated pseudo-labels perform nearly as well as those trained with actual ground-truth annotations. Furthermore, the SLF auto-labeler shows promising results in detailed shape predictions, providing a potential alternative for the occupancy annotation of dynamic objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11382v2-abstract-full').style.display = 'none'; document.getElementById('2407.11382v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01029">arXiv:2407.01029</a> <span> [<a href="https://arxiv.org/pdf/2407.01029">pdf</a>, <a href="https://arxiv.org/format/2407.01029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EndoSparse: Real-Time Sparse View Synthesis of Endoscopic Scenes using Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y.
Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hengyu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Weihao Yu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01029v1-abstract-short" style="display: inline;"> 3D reconstruction of biological tissues from a collection of endoscopic images is key to unlocking various important downstream surgical applications with 3D capabilities. Existing methods employ various advanced neural rendering techniques for photorealistic view synthesis, but they often struggle to recover accurate 3D representations when only sparse observations are available, which is usually… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01029v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01029v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01029v1-abstract-full" style="display: none;"> 3D reconstruction of biological tissues from a collection of endoscopic images is key to unlocking various important downstream surgical applications with 3D capabilities. Existing methods employ various advanced neural rendering techniques for photorealistic view synthesis, but they often struggle to recover accurate 3D representations when only sparse observations are available, which is usually the case in real-world clinical scenarios. To tackle this sparsity challenge, we propose a framework leveraging the prior knowledge from multiple foundation models during the reconstruction process, dubbed EndoSparse. Experimental results indicate that our proposed strategy significantly improves the geometric and appearance quality under challenging sparse-view conditions, including using only three views. In rigorous benchmarking experiments against state-of-the-art methods, EndoSparse achieves superior results in terms of accurate geometry, realistic appearance, and rendering efficiency, confirming the robustness to sparse-view limitations in endoscopic reconstruction. EndoSparse signifies a steady step towards the practical deployment of neural 3D reconstruction in real-world clinical scenarios. Project page: https://endo-sparse.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01029v1-abstract-full').style.display = 'none'; document.getElementById('2407.01029v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
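EndoSparse, as summarized above, injects priors from foundation models into sparse-view endoscopic reconstruction. One common way to express such a prior is an auxiliary loss against a foundation model's monocular depth estimate; the scale-invariant formulation and the weighting below are assumptions added for illustration and are not the EndoSparse objective.

```python
# Hedged sketch of a foundation-model depth prior as an extra loss term
# (assumed formulation, not the EndoSparse training objective).
import torch

def scale_invariant_depth_loss(d_render, d_prior, eps=1e-6):
    """Compare rendered and prior depth up to a scale/shift ambiguity."""
    def normalize(d):
        d = d.flatten()
        return (d - d.median()) / (d.abs().mean() + eps)
    return torch.mean(torch.abs(normalize(d_render) - normalize(d_prior)))

def training_step(render_rgb, render_depth, gt_rgb, prior_depth, w_prior=0.1):
    photometric = torch.mean(torch.abs(render_rgb - gt_rgb))       # fit the sparse views
    prior = scale_invariant_depth_loss(render_depth, prior_depth)  # follow the prior
    return photometric + w_prior * prior

# toy usage with random tensors standing in for renders and estimates
loss = training_step(torch.rand(3, 64, 64), torch.rand(64, 64),
                     torch.rand(3, 64, 64), torch.rand(64, 64))
```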
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accpeted by MICCAI2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14746">arXiv:2406.14746</a> <span> [<a href="https://arxiv.org/pdf/2406.14746">pdf</a>, <a href="https://arxiv.org/format/2406.14746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Behavior-Inspired Neural Networks for Relational Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yulong Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bowen Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Keqin Wang</a>, <a href="/search/cs?searchtype=author&query=Leonard%2C+N">Naomi Leonard</a>, <a href="/search/cs?searchtype=author&query=Dieng%2C+A+B">Adji Bousso Dieng</a>, <a href="/search/cs?searchtype=author&query=Allen-Blanchette%2C+C">Christine Allen-Blanchette</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14746v2-abstract-short" style="display: inline;"> From pedestrians to Kuramoto oscillators, interactions between agents govern how a multitude of dynamical systems evolve in space and time. Discovering how these agents relate to each other can improve our understanding of the often complex dynamics that underlie these systems. Recent works learn to categorize relationships between agents based on observations of their physical behavior. These app… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14746v2-abstract-full').style.display = 'inline'; document.getElementById('2406.14746v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14746v2-abstract-full" style="display: none;"> From pedestrians to Kuramoto oscillators, interactions between agents govern how a multitude of dynamical systems evolve in space and time. Discovering how these agents relate to each other can improve our understanding of the often complex dynamics that underlie these systems. Recent works learn to categorize relationships between agents based on observations of their physical behavior. These approaches are limited in that the relationship categories are modelled as outcomes of categorical distribution, when in real world systems categories often intermingle and interact. In this work, we introduce a level of abstraction between the observable behavior of agents and the latent categories that determine their behavior. To do this, we learn a mapping from agent behavior to agent preferences for each latent category in a graph neural network. We integrate the physical proximity of agents and their preferences in a nonlinear opinion dynamics model which provides a mechanism to identify mutually exclusive latent categories, predict an agent's evolution in time, and control an agent's physical behavior. 
We demonstrate the utility of our model for learning interpretable categories, and its efficacy on long-horizon prediction across several benchmarks where we outperform existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14746v2-abstract-full').style.display = 'none'; document.getElementById('2406.14746v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12816">arXiv:2406.12816</a> <span> [<a href="https://arxiv.org/pdf/2406.12816">pdf</a>, <a href="https://arxiv.org/format/2406.12816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Neural Approximate Mirror Maps for Constrained Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+B+T">Berthy T. Feng</a>, <a href="/search/cs?searchtype=author&query=Baptista%2C+R">Ricardo Baptista</a>, <a href="/search/cs?searchtype=author&query=Bouman%2C+K+L">Katherine L. Bouman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12816v1-abstract-short" style="display: inline;"> Diffusion models excel at creating visually-convincing images, but they often struggle to meet subtle constraints inherent in the training data. Such constraints could be physics-based (e.g., satisfying a PDE), geometric (e.g., respecting symmetry), or semantic (e.g., including a particular number of objects). When the training data all satisfy a certain constraint, enforcing this constraint on a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12816v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12816v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12816v1-abstract-full" style="display: none;"> Diffusion models excel at creating visually-convincing images, but they often struggle to meet subtle constraints inherent in the training data. Such constraints could be physics-based (e.g., satisfying a PDE), geometric (e.g., respecting symmetry), or semantic (e.g., including a particular number of objects). When the training data all satisfy a certain constraint, enforcing this constraint on a diffusion model not only improves its distribution-matching accuracy but also makes it more reliable for generating valid synthetic data and solving constrained inverse problems. However, existing methods for constrained diffusion models are inflexible with different types of constraints. 
Recent work proposed to learn mirror diffusion models (MDMs) in an unconstrained space defined by a mirror map and to impose the constraint with an inverse mirror map, but analytical mirror maps are challenging to derive for complex constraints. We propose neural approximate mirror maps (NAMMs) for general constraints. Our approach only requires a differentiable distance function from the constraint set. We learn an approximate mirror map that pushes data into an unconstrained space and a corresponding approximate inverse that maps data back to the constraint set. A generative model, such as an MDM, can then be trained in the learned mirror space and its samples restored to the constraint set by the inverse map. We validate our approach on a variety of constraints, showing that compared to an unconstrained diffusion model, a NAMM-based MDM substantially improves constraint satisfaction. We also demonstrate how existing diffusion-based inverse-problem solvers can be easily applied in the learned mirror space to solve constrained inverse problems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12816v1-abstract-full').style.display = 'none'; document.getElementById('2406.12816v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12355">arXiv:2406.12355</a> <span> [<a href="https://arxiv.org/pdf/2406.12355">pdf</a>, <a href="https://arxiv.org/format/2406.12355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LiCAF: LiDAR-Camera Asymmetric Fusion for Gait Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yunze Deng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Haijun Xiong</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12355v1-abstract-short" style="display: inline;"> Gait recognition is a biometric technology that identifies individuals by using walking patterns. Due to the significant achievements of multimodal fusion in gait recognition, we consider employing LiDAR-camera fusion to obtain robust gait representations. However, existing methods often overlook intrinsic characteristics of modalities, and lack fine-grained fusion and temporal modeling. In this p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12355v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12355v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12355v1-abstract-full" style="display: none;"> Gait recognition is a biometric technology that identifies individuals by using walking patterns. 
Due to the significant achievements of multimodal fusion in gait recognition, we consider employing LiDAR-camera fusion to obtain robust gait representations. However, existing methods often overlook intrinsic characteristics of modalities, and lack fine-grained fusion and temporal modeling. In this paper, we introduce a novel modality-sensitive network LiCAF for LiDAR-camera fusion, which employs an asymmetric modeling strategy. Specifically, we propose Asymmetric Cross-modal Channel Attention (ACCA) and Interlaced Cross-modal Temporal Modeling (ICTM) for cross-modal valuable channel information selection and powerful temporal modeling. Our method achieves state-of-the-art performance (93.9% in Rank-1 and 98.8% in Rank-5) on the SUSTech1K dataset, demonstrating its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12355v1-abstract-full').style.display = 'none'; document.getElementById('2406.12355v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICIP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08814">arXiv:2406.08814</a> <span> [<a href="https://arxiv.org/pdf/2406.08814">pdf</a>, <a href="https://arxiv.org/format/2406.08814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Skim then Focus: Integrating Contextual and Fine-grained Views for Repetitive Action Counting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhengqi Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaohu Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+K">Kun Yao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinggang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyu Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bin Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08814v1-abstract-short" style="display: inline;"> The key to action counting is accurately locating each video's repetitive actions. Instead of estimating the probability of each frame belonging to an action directly, we propose a dual-branch network, i.e., SkimFocusNet, working in a two-step manner. 
The model draws inspiration from empirical observations indicating that humans typically engage in coarse skimming of entire sequences to grasp the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08814v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08814v1-abstract-full" style="display: none;"> The key to action counting is accurately locating each video's repetitive actions. Instead of estimating the probability of each frame belonging to an action directly, we propose a dual-branch network, i.e., SkimFocusNet, working in a two-step manner. The model draws inspiration from empirical observations indicating that humans typically engage in coarse skimming of entire sequences to grasp the general action pattern initially, followed by a finer, frame-by-frame focus to determine if it aligns with the target action. Specifically, SkimFocusNet incorporates a skim branch and a focus branch. The skim branch scans the global contextual information throughout the sequence to identify the potential target action for guidance. Subsequently, the focus branch utilizes the guidance to diligently identify repetitive actions using a long-short adaptive guidance (LSAG) block. Additionally, we have observed that videos in existing datasets often feature only one type of repetitive action, which inadequately represents real-world scenarios. To more accurately describe real-life situations, we establish the Multi-RepCount dataset, which includes videos containing multiple repetitive motions. On Multi-RepCount, our SkimFocusNet can perform specified action counting, that is, counting a particular action type by referencing an exemplar video. This capability substantially demonstrates the robustness of our method. Extensive experiments demonstrate that SkimFocusNet achieves state-of-the-art performance with significant improvements. We also conduct a thorough ablation study to evaluate the network components. The source code will be published upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08814v1-abstract-full').style.display = 'none'; document.getElementById('2406.08814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
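The SkimFocusNet abstract describes a skim branch that extracts global guidance from the whole sequence and a focus branch that scores individual frames under that guidance. The sketch below reproduces only that two-branch conditioning pattern; the GRU skim encoder, fusion by concatenation, and all dimensions are assumptions rather than the published architecture (which uses a long-short adaptive guidance block).

```python
# Generic skim-then-focus sketch (assumed layers, not the SkimFocusNet code):
# a cheap global pass produces guidance that conditions per-frame scoring.
import torch
import torch.nn as nn

class SkimThenFocus(nn.Module):
    def __init__(self, feat_dim: int = 128):
        super().__init__()
        self.skim = nn.GRU(input_size=feat_dim, hidden_size=feat_dim, batch_first=True)
        self.focus = nn.Sequential(
            nn.Linear(2 * feat_dim, feat_dim), nn.ReLU(),
            nn.Linear(feat_dim, 1),            # per-frame action score
        )

    def forward(self, frame_feats):            # frame_feats: (B, T, feat_dim)
        _, guidance = self.skim(frame_feats)   # (1, B, feat_dim) global summary
        guidance = guidance[-1].unsqueeze(1).expand_as(frame_feats)
        fused = torch.cat([frame_feats, guidance], dim=-1)
        return self.focus(fused).squeeze(-1)   # (B, T) scores whose peaks are counted

scores = SkimThenFocus()(torch.randn(2, 64, 128))  # two clips of 64 frame features
```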
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02785">arXiv:2406.02785</a> <span> [<a href="https://arxiv.org/pdf/2406.02785">pdf</a>, <a href="https://arxiv.org/format/2406.02785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3847/1538-4357/ad737f">10.3847/1538-4357/ad737f <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Event-horizon-scale Imaging of M87* under Different Assumptions via Deep Generative Image Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+B+T">Berthy T. Feng</a>, <a href="/search/cs?searchtype=author&query=Bouman%2C+K+L">Katherine L. Bouman</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02785v2-abstract-short" style="display: inline;"> Reconstructing images from the Event Horizon Telescope (EHT) observations of M87*, the supermassive black hole at the center of the galaxy M87, depends on a prior to impose desired image statistics. However, given the impossibility of directly observing black holes, there is no clear choice for a prior. We present a framework for flexibly designing a range of priors, each bringing different biases… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02785v2-abstract-full').style.display = 'inline'; document.getElementById('2406.02785v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02785v2-abstract-full" style="display: none;"> Reconstructing images from the Event Horizon Telescope (EHT) observations of M87*, the supermassive black hole at the center of the galaxy M87, depends on a prior to impose desired image statistics. However, given the impossibility of directly observing black holes, there is no clear choice for a prior. We present a framework for flexibly designing a range of priors, each bringing different biases to the image reconstruction. These priors can be weak (e.g., impose only basic natural-image statistics) or strong (e.g., impose assumptions of black-hole structure). Our framework uses Bayesian inference with score-based priors, which are data-driven priors arising from a deep generative model that can learn complicated image distributions. Using our Bayesian imaging approach with sophisticated data-driven priors, we can assess how visual features and uncertainty of reconstructed images change depending on the prior. 
In addition to simulated data, we image the real EHT M87* data and discuss how recovered features are influenced by the choice of prior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02785v2-abstract-full').style.display = 'none'; document.getElementById('2406.02785v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ApJ 975 201 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20334">arXiv:2405.20334</a> <span> [<a href="https://arxiv.org/pdf/2405.20334">pdf</a>, <a href="https://arxiv.org/format/2405.20334">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> VividDream: Generating 3D Scene with Ambient Dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Yao-Chih Lee</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi-Ting Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Andrew Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+T">Ting-Hsuan Liao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jia-Bin Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20334v1-abstract-short" style="display: inline;"> We introduce VividDream, a method for generating explorable 4D scenes with ambient dynamics from a single input image or text prompt. VividDream first expands an input image into a static 3D point cloud through iterative inpainting and geometry merging. An ensemble of animated videos is then generated using video diffusion models with quality refinement techniques and conditioned on renderings of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20334v1-abstract-full').style.display = 'inline'; document.getElementById('2405.20334v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20334v1-abstract-full" style="display: none;"> We introduce VividDream, a method for generating explorable 4D scenes with ambient dynamics from a single input image or text prompt. VividDream first expands an input image into a static 3D point cloud through iterative inpainting and geometry merging. An ensemble of animated videos is then generated using video diffusion models with quality refinement techniques and conditioned on renderings of the static 3D scene from the sampled camera trajectories. 
We then optimize a canonical 4D scene representation using an animated video ensemble, with per-video motion embeddings and visibility masks to mitigate inconsistencies. The resulting 4D scene enables free-view exploration of a 3D scene with plausible ambient scene dynamics. Experiments demonstrate that VividDream can provide human viewers with compelling 4D experiences generated based on diverse real images and text prompts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20334v1-abstract-full').style.display = 'none'; document.getElementById('2405.20334v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://vivid-dream-4d.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.06814">arXiv:2405.06814</a> <span> [<a href="https://arxiv.org/pdf/2405.06814">pdf</a>, <a href="https://arxiv.org/format/2405.06814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1038/s41598-024-79090-y">10.1038/s41598-024-79090-y <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Dual-Task Vision Transformer for Rapid and Accurate Intracerebral Hemorrhage CT Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jialiang Fan</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+X">Xinhui Fan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Chengyan Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaofan Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bingdong Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lucan Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+G">Guoyu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.06814v3-abstract-short" style="display: inline;"> Intracerebral hemorrhage (ICH) is a severe and sudden medical condition caused by the rupture of blood vessels in the brain, leading to permanent damage to brain tissue and often resulting in functional disabilities or death in patients. Diagnosis and analysis of ICH typically rely on brain CT imaging. 
Given the urgency of ICH conditions, early treatment is crucial, necessitating rapid analysis of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06814v3-abstract-full').style.display = 'inline'; document.getElementById('2405.06814v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.06814v3-abstract-full" style="display: none;"> Intracerebral hemorrhage (ICH) is a severe and sudden medical condition caused by the rupture of blood vessels in the brain, leading to permanent damage to brain tissue and often resulting in functional disabilities or death in patients. Diagnosis and analysis of ICH typically rely on brain CT imaging. Given the urgency of ICH conditions, early treatment is crucial, necessitating rapid analysis of CT images to formulate tailored treatment plans. However, the complexity of ICH CT images and the frequent scarcity of specialist radiologists pose significant challenges. Therefore, we collect a real-world dataset for ICH-versus-normal classification and for three-way ICH classification based on hemorrhage location, i.e., Deep, Subcortical, and Lobar. In addition, we propose a neural network structure, dual-task vision transformer (DTViT), for the automated classification and diagnosis of ICH images. The DTViT deploys the encoder from the Vision Transformer (ViT), employing attention mechanisms for feature extraction from CT images. The proposed DTViT framework also incorporates two multilayer perceptron (MLP)-based decoders to simultaneously identify the presence of ICH and classify the three types of hemorrhage locations. Experimental results demonstrate that DTViT performs well on the real-world test dataset. The code and newly collected dataset for this work are available at: https://github.com/jfan1997/DTViT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06814v3-abstract-full').style.display = 'none'; document.getElementById('2405.06814v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04434">arXiv:2405.04434</a> <span> [<a href="https://arxiv.org/pdf/2405.04434">pdf</a>, <a href="https://arxiv.org/format/2405.04434">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chenggang Zhao</a>, <a href="/search/cs?searchtype=author&query=Dengr%2C+C">Chengqi Dengr</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deli Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Dongjie Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erhang Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fangyun Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+G">Guangbo Hao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guowei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hanwei Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hao Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Honghui Ding</a> , et al. (132 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04434v5-abstract-short" style="display: inline;"> We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. 
MLA guarantees efficient inference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04434v5-abstract-full').style.display = 'inline'; document.getElementById('2405.04434v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04434v5-abstract-full" style="display: none;"> We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference through significantly compressing the Key-Value (KV) cache into a latent vector, while DeepSeekMoE enables training strong models at an economical cost through sparse computation. Compared with DeepSeek 67B, DeepSeek-V2 achieves significantly stronger performance, and meanwhile saves 42.5% of training costs, reduces the KV cache by 93.3%, and boosts the maximum generation throughput to 5.76 times. We pretrain DeepSeek-V2 on a high-quality and multi-source corpus consisting of 8.1T tokens, and further perform Supervised Fine-Tuning (SFT) and Reinforcement Learning (RL) to fully unlock its potential. Evaluation results show that, even with only 21B activated parameters, DeepSeek-V2 and its chat versions still achieve top-tier performance among open-source models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04434v5-abstract-full').style.display = 'none'; document.getElementById('2405.04434v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.15014">arXiv:2404.15014</a> <span> [<a href="https://arxiv.org/pdf/2404.15014">pdf</a>, <a href="https://arxiv.org/format/2404.15014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OccGen: Generative Multi-modal 3D Occupancy Prediction for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongdao Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+P">Pin Tang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jilai Zheng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiangxuan Ren</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bailan Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chao Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.15014v1-abstract-short" style="display: inline;"> Existing solutions for 3D semantic occupancy prediction typically treat the task as a one-shot 3D voxel-wise segmentation perception problem. These discriminative methods focus on learning the mapping between the inputs and occupancy map in a single step, lacking the ability to gradually refine the occupancy map and the reasonable scene imaginative capacity to complete the local regions somewhere.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15014v1-abstract-full').style.display = 'inline'; document.getElementById('2404.15014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.15014v1-abstract-full" style="display: none;"> Existing solutions for 3D semantic occupancy prediction typically treat the task as a one-shot 3D voxel-wise segmentation perception problem. These discriminative methods focus on learning the mapping between the inputs and occupancy map in a single step, lacking the ability to gradually refine the occupancy map and the reasonable scene imaginative capacity to complete the local regions somewhere. In this paper, we introduce OccGen, a simple yet powerful generative perception model for the task of 3D semantic occupancy prediction. OccGen adopts a ''noise-to-occupancy'' generative paradigm, progressively inferring and refining the occupancy map by predicting and eliminating noise originating from a random Gaussian distribution. OccGen consists of two main components: a conditional encoder that is capable of processing multi-modal inputs, and a progressive refinement decoder that applies diffusion denoising using the multi-modal features as conditions. A key insight of this generative pipeline is that the diffusion denoising process is naturally able to model the coarse-to-fine refinement of the dense 3D occupancy map, therefore producing more detailed predictions. Extensive experiments on several occupancy benchmarks demonstrate the effectiveness of the proposed method compared to the state-of-the-art methods. 
For instance, OccGen achieves relative mIoU gains of 9.5%, 6.3%, and 13.3% on the nuScenes-Occupancy dataset under the multi-modal, LiDAR-only, and camera-only settings, respectively. Moreover, as a generative perception model, OccGen exhibits desirable properties that discriminative models cannot achieve, such as providing uncertainty estimates alongside its multiple-step predictions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.15014v1-abstract-full').style.display = 'none'; document.getElementById('2404.15014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13026">arXiv:2404.13026</a> <span> [<a href="https://arxiv.org/pdf/2404.13026">pdf</a>, <a href="https://arxiv.org/format/2404.13026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PhysDreamer: Physics-Based Interaction with 3D Objects via Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hong-Xing Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Rundi Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Changxi Zheng</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajun Wu</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13026v2-abstract-short" style="display: inline;"> Realistic object interactions are crucial for creating immersive virtual experiences, yet synthesizing realistic 3D object dynamics in response to novel interactions remains a significant challenge. Unlike unconditional or text-conditioned dynamics generation, action-conditioned dynamics requires perceiving the physical material properties of objects and grounding the 3D motion prediction on these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13026v2-abstract-full').style.display = 'inline'; document.getElementById('2404.13026v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13026v2-abstract-full" style="display: none;"> Realistic object interactions are crucial for creating immersive virtual experiences, yet synthesizing realistic 3D object dynamics in response to novel interactions remains a significant challenge.
Unlike unconditional or text-conditioned dynamics generation, action-conditioned dynamics requires perceiving the physical material properties of objects and grounding the 3D motion prediction on these properties, such as object stiffness. However, estimating physical material properties is an open problem due to the lack of material ground-truth data, as measuring these properties for real objects is highly difficult. We present PhysDreamer, a physics-based approach that endows static 3D objects with interactive dynamics by leveraging the object dynamics priors learned by video generation models. By distilling these priors, PhysDreamer enables the synthesis of realistic object responses to novel interactions, such as external forces or agent manipulations. We demonstrate our approach on diverse examples of elastic objects and evaluate the realism of the synthesized interactions through a user study. PhysDreamer takes a step towards more engaging and realistic virtual experiences by enabling static 3D objects to dynamically respond to interactive stimuli in a physically plausible manner. See our project page at https://physdreamer.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13026v2-abstract-full').style.display = 'none'; document.getElementById('2404.13026v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website at: https://physdreamer.github.io/ Appear on ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09734">arXiv:2404.09734</a> <span> [<a href="https://arxiv.org/pdf/2404.09734">pdf</a>, <a href="https://arxiv.org/format/2404.09734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Weighted Sum-Rate Maximization for Movable Antenna-Enhanced Wireless Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+B">Biqian Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yongpeng Wu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+X">Xiang-Gen Xia</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chengshan Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09734v1-abstract-short" style="display: inline;"> This letter investigates the weighted sum rate maximization problem in movable antenna (MA)-enhanced systems. To reduce the computational complexity, we transform it into a more tractable weighted minimum mean square error (WMMSE) problem well-suited for MA. 
We then adopt the WMMSE algorithm and majorization-minimization algorithm to optimize the beamforming and antenna positions, respectively. Mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09734v1-abstract-full').style.display = 'inline'; document.getElementById('2404.09734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09734v1-abstract-full" style="display: none;"> This letter investigates the weighted sum rate maximization problem in movable antenna (MA)-enhanced systems. To reduce the computational complexity, we transform it into a more tractable weighted minimum mean square error (WMMSE) problem well-suited for MA. We then adopt the WMMSE algorithm and majorization-minimization algorithm to optimize the beamforming and antenna positions, respectively. Moreover, we propose a planar movement mode, which constrains each MA to a specified area and for which we obtain a low-complexity closed-form solution. Numerical results demonstrate that the MA-enhanced system outperforms the conventional system. Besides, the computation time for the planar movement mode is reduced by approximately 30% at little performance expense. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09734v1-abstract-full').style.display = 'none'; document.getElementById('2404.09734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Wireless Communications Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09502">arXiv:2404.09502</a> <span> [<a href="https://arxiv.org/pdf/2404.09502">pdf</a>, <a href="https://arxiv.org/format/2404.09502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SparseOcc: Rethinking Sparse Latent Representation for Vision-Based Semantic Occupancy Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+P">Pin Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongdao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jilai Zheng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiangxuan Ren</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bailan Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chao Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09502v1-abstract-short" style="display: inline;"> Vision-based perception for autonomous driving requires an explicit modeling of a 3D space, where 2D latent representations are mapped and subsequent 3D operators are applied.
However, operating on dense latent spaces introduces a cubic time and space complexity, which limits scalability in terms of perception range or spatial resolution. Existing approaches compress the dense representation using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09502v1-abstract-full').style.display = 'inline'; document.getElementById('2404.09502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09502v1-abstract-full" style="display: none;"> Vision-based perception for autonomous driving requires an explicit modeling of a 3D space, where 2D latent representations are mapped and subsequent 3D operators are applied. However, operating on dense latent spaces introduces a cubic time and space complexity, which limits scalability in terms of perception range or spatial resolution. Existing approaches compress the dense representation using projections like Bird's Eye View (BEV) or Tri-Perspective View (TPV). Although efficient, these projections result in information loss, especially for tasks like semantic occupancy prediction. To address this, we propose SparseOcc, an efficient occupancy network inspired by sparse point cloud processing. It utilizes a lossless sparse latent representation with three key innovations. Firstly, a 3D sparse diffuser performs latent completion using spatially decomposed 3D sparse convolutional kernels. Secondly, a feature pyramid and sparse interpolation enhance scales with information from others. Finally, the transformer head is redesigned as a sparse variant. SparseOcc achieves a remarkable 74.9% reduction on FLOPs over the dense baseline. Interestingly, it also improves accuracy, from 12.8% to 14.1% mIOU, which in part can be attributed to the sparse representation's ability to avoid hallucinations on empty voxels. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09502v1-abstract-full').style.display = 'none'; document.getElementById('2404.09502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures, accepted by CVPR 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Conference on Computer Vision and Pattern Recognition 2024 (CVPR 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07985">arXiv:2404.07985</a> <span> [<a href="https://arxiv.org/pdf/2404.07985">pdf</a>, <a href="https://arxiv.org/format/2404.07985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> WaveMo: Learning Wavefront Modulations to See Through Scattering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+M">Mingyang Xie</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Haiyun Guo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+L">Lingbo Jin</a>, <a href="/search/cs?searchtype=author&query=Veeraraghavan%2C+A">Ashok Veeraraghavan</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+C+A">Christopher A. Metzler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07985v1-abstract-short" style="display: inline;"> Imaging through scattering media is a fundamental and pervasive challenge in fields ranging from medical diagnostics to astronomy. A promising strategy to overcome this challenge is wavefront modulation, which induces measurement diversity during image acquisition. Despite its importance, designing optimal wavefront modulations to image through scattering remains under-explored. This paper introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07985v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07985v1-abstract-full" style="display: none;"> Imaging through scattering media is a fundamental and pervasive challenge in fields ranging from medical diagnostics to astronomy. A promising strategy to overcome this challenge is wavefront modulation, which induces measurement diversity during image acquisition. Despite its importance, designing optimal wavefront modulations to image through scattering remains under-explored. This paper introduces a novel learning-based framework to address the gap. Our approach jointly optimizes wavefront modulations and a computationally lightweight feedforward "proxy" reconstruction network. This network is trained to recover scenes obscured by scattering, using measurements that are modified by these modulations. The learned modulations produced by our framework generalize effectively to unseen scattering scenarios and exhibit remarkable versatility. 
During deployment, the learned modulations can be decoupled from the proxy network to augment other more computationally expensive restoration algorithms. Through extensive experiments, we demonstrate our approach significantly advances the state of the art in imaging through scattering media. Our project webpage is at https://wavemo-2024.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07985v1-abstract-full').style.display = 'none'; document.getElementById('2404.07985v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00471">arXiv:2404.00471</a> <span> [<a href="https://arxiv.org/pdf/2404.00471">pdf</a>, <a href="https://arxiv.org/format/2404.00471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP48485.2024.10447579">10.1109/ICASSP48485.2024.10447579 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Score-Based Diffusion Models for Photoacoustic Tomography Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dey%2C+S">Sreemanti Dey</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Snigdha Saha</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+T">Berthy T. Feng</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+M">Manxiu Cui</a>, <a href="/search/cs?searchtype=author&query=Delisle%2C+L">Laure Delisle</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+O">Oscar Leong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L+V">Lihong V. Wang</a>, <a href="/search/cs?searchtype=author&query=Bouman%2C+K+L">Katherine L. Bouman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00471v1-abstract-short" style="display: inline;"> Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality that combines optical absorption contrast with ultrasound imaging depth. One challenge in PAT is image reconstruction with inadequate acoustic signals due to limited sensor coverage or due to the density of the transducer array. Such cases call for solving an ill-posed inverse reconstruction problem. 
In this work, we use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00471v1-abstract-full').style.display = 'inline'; document.getElementById('2404.00471v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00471v1-abstract-full" style="display: none;"> Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality that combines optical absorption contrast with ultrasound imaging depth. One challenge in PAT is image reconstruction with inadequate acoustic signals due to limited sensor coverage or due to the density of the transducer array. Such cases call for solving an ill-posed inverse reconstruction problem. In this work, we use score-based diffusion models to solve the inverse problem of reconstructing an image from limited PAT measurements. The proposed approach allows us to incorporate an expressive prior learned by a diffusion model on simulated vessel structures while still being robust to varying transducer sparsity conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00471v1-abstract-full').style.display = 'none'; document.getElementById('2404.00471v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Seoul, Korea, Republic of, 2024, pp. 
2470-2474 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.16095">arXiv:2403.16095</a> <span> [<a href="https://arxiv.org/pdf/2403.16095">pdf</a>, <a href="https://arxiv.org/format/2403.16095">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> CG-SLAM: Efficient Dense RGB-D SLAM in a Consistent Uncertainty-aware 3D Gaussian Field </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiarui Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xianhao Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Boyin Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guanglin Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liangjing Yang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.16095v1-abstract-short" style="display: inline;"> Recently neural radiance fields (NeRF) have been widely exploited as 3D representations for dense simultaneous localization and mapping (SLAM). Despite their notable successes in surface modeling and novel view synthesis, existing NeRF-based methods are hindered by their computationally intensive and time-consuming volume rendering pipeline. This paper presents an efficient dense RGB-D SLAM system… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16095v1-abstract-full').style.display = 'inline'; document.getElementById('2403.16095v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.16095v1-abstract-full" style="display: none;"> Recently neural radiance fields (NeRF) have been widely exploited as 3D representations for dense simultaneous localization and mapping (SLAM). Despite their notable successes in surface modeling and novel view synthesis, existing NeRF-based methods are hindered by their computationally intensive and time-consuming volume rendering pipeline. This paper presents an efficient dense RGB-D SLAM system, i.e., CG-SLAM, based on a novel uncertainty-aware 3D Gaussian field with high consistency and geometric stability. Through an in-depth analysis of Gaussian Splatting, we propose several techniques to construct a consistent and stable 3D Gaussian field suitable for tracking and mapping. Additionally, a novel depth uncertainty model is proposed to ensure the selection of valuable Gaussian primitives during optimization, thereby improving tracking efficiency and accuracy. Experiments on various datasets demonstrate that CG-SLAM achieves superior tracking and mapping performance with a notable tracking speed of up to 15 Hz. We will make our source code publicly available. Project page: https://zju3dv.github.io/cg-slam. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.16095v1-abstract-full').style.display = 'none'; document.getElementById('2403.16095v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://zju3dv.github.io/cg-slam</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13800">arXiv:2403.13800</a> <span> [<a href="https://arxiv.org/pdf/2403.13800">pdf</a>, <a href="https://arxiv.org/format/2403.13800">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TimeRewind: Rewinding Time with Image-and-Events Video Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingxi Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Haoming Cai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+M">Mingyang Xie</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+C">Christopher Metzler</a>, <a href="/search/cs?searchtype=author&query=Fermuller%2C+C">Cornelia Fermuller</a>, <a href="/search/cs?searchtype=author&query=Aloimonos%2C+Y">Yiannis Aloimonos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13800v1-abstract-short" style="display: inline;"> This paper addresses the novel challenge of ``rewinding'' time from a single captured image to recover the fleeting moments missed just before the shutter button is pressed. This problem poses a significant challenge in computer vision and computational photography, as it requires predicting plausible pre-capture motion from a single static frame, an inherently ill-posed task due to the high degre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13800v1-abstract-full').style.display = 'inline'; document.getElementById('2403.13800v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13800v1-abstract-full" style="display: none;"> This paper addresses the novel challenge of ``rewinding'' time from a single captured image to recover the fleeting moments missed just before the shutter button is pressed. This problem poses a significant challenge in computer vision and computational photography, as it requires predicting plausible pre-capture motion from a single static frame, an inherently ill-posed task due to the high degree of freedom in potential pixel movements. We overcome this challenge by leveraging the emerging technology of neuromorphic event cameras, which capture motion information with high temporal resolution, and integrating this data with advanced image-to-video diffusion models. 
Our proposed framework introduces an event motion adaptor conditioned on event camera data, guiding the diffusion model to generate videos that are visually coherent and physically grounded in the captured events. Through extensive experimentation, we demonstrate the capability of our approach to synthesize high-quality videos that effectively ``rewind'' time, showcasing the potential of combining event camera technology with generative models. Our work opens new avenues for research at the intersection of computer vision, computational photography, and generative modeling, offering a forward-thinking solution to capturing missed moments and enhancing future consumer cameras and smartphones. Please see the project page at https://timerewind.github.io/ for video results and code release. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13800v1-abstract-full').style.display = 'none'; document.getElementById('2403.13800v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.11050">arXiv:2403.11050</a> <span> [<a href="https://arxiv.org/pdf/2403.11050">pdf</a>, <a href="https://arxiv.org/format/2403.11050">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Endora: Video Generation Models as Endoscopy Simulators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenxin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hengyu Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yifan Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wuyang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhen Chen</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yixuan Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.11050v1-abstract-short" style="display: inline;"> Generative models hold promise for revolutionizing medical education, robot-assisted surgery, and data augmentation for machine learning. Despite progress in generating 2D medical images, the complex domain of clinical video generation has largely remained untapped.This paper introduces \model, an innovative approach to generate medical videos that simulate clinical endoscopy scenes. 
We present a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11050v1-abstract-full').style.display = 'inline'; document.getElementById('2403.11050v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.11050v1-abstract-full" style="display: none;"> Generative models hold promise for revolutionizing medical education, robot-assisted surgery, and data augmentation for machine learning. Despite progress in generating 2D medical images, the complex domain of clinical video generation has largely remained untapped. This paper introduces Endora, an innovative approach to generate medical videos that simulate clinical endoscopy scenes. We present a novel generative model design that integrates a meticulously crafted spatial-temporal video transformer with advanced 2D vision foundation model priors, explicitly modeling spatial-temporal dynamics during video generation. We also pioneer the first public benchmark for endoscopy simulation with video generation models, adapting existing state-of-the-art methods for this endeavor. Endora demonstrates exceptional visual quality in generating endoscopy videos, surpassing state-of-the-art methods in extensive testing. Moreover, we explore how this endoscopy simulator can empower downstream video analysis tasks and even generate 3D medical scenes with multi-view consistency. In a nutshell, Endora marks a notable breakthrough in the deployment of generative AI for clinical endoscopy research, setting a substantial stage for further advances in medical content generation. For more details, please visit our project page: https://endora-medvidgen.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.11050v1-abstract-full').style.display = 'none'; document.getElementById('2403.11050v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://endora-medvidgen.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11805">arXiv:2312.11805</a> <span> [<a href="https://arxiv.org/pdf/2312.11805">pdf</a>, <a href="https://arxiv.org/format/2312.11805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gemini: A Family of Highly Capable Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&query=Anil%2C+R">Rohan Anil</a>, <a href="/search/cs?searchtype=author&query=Borgeaud%2C+S">Sebastian Borgeaud</a>, <a href="/search/cs?searchtype=author&query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiahui Yu</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a>, <a href="/search/cs?searchtype=author&query=Schalkwyk%2C+J">Johan Schalkwyk</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+A+M">Andrew M. Dai</a>, <a href="/search/cs?searchtype=author&query=Hauth%2C+A">Anja Hauth</a>, <a href="/search/cs?searchtype=author&query=Millican%2C+K">Katie Millican</a>, <a href="/search/cs?searchtype=author&query=Silver%2C+D">David Silver</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+M">Melvin Johnson</a>, <a href="/search/cs?searchtype=author&query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Glaese%2C+A">Amelia Glaese</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jilin Chen</a>, <a href="/search/cs?searchtype=author&query=Pitler%2C+E">Emily Pitler</a>, <a href="/search/cs?searchtype=author&query=Lillicrap%2C+T">Timothy Lillicrap</a>, <a href="/search/cs?searchtype=author&query=Lazaridou%2C+A">Angeliki Lazaridou</a>, <a href="/search/cs?searchtype=author&query=Firat%2C+O">Orhan Firat</a>, <a href="/search/cs?searchtype=author&query=Molloy%2C+J">James Molloy</a>, <a href="/search/cs?searchtype=author&query=Isard%2C+M">Michael Isard</a>, <a href="/search/cs?searchtype=author&query=Barham%2C+P+R">Paul R. Barham</a>, <a href="/search/cs?searchtype=author&query=Hennigan%2C+T">Tom Hennigan</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Benjamin Lee</a> , et al. (1325 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11805v4-abstract-short" style="display: inline;"> This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. 
arXiv:2312.11805  [pdf, other]  cs.CL cs.AI cs.CV

Gemini: A Family of Highly Capable Multimodal Models

Authors: Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M. Dai, Anja Hauth, Katie Millican, David Silver, Melvin Johnson, Ioannis Antonoglou, Julian Schrittwieser, Amelia Glaese, Jilin Chen, Emily Pitler, Timothy Lillicrap, Angeliki Lazaridou, Orhan Firat, James Molloy, Michael Isard, Paul R. Barham, Tom Hennigan, Benjamin Lee, et al. (1325 additional authors not shown)

Abstract: This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultra model advances the state of the art in 30 of 32 of these benchmarks - notably being the first model to achieve human-expert performance on the well-studied exam benchmark MMLU, and improving the state of the art in every one of the 20 multimodal benchmarks we examined. We believe that the new capabilities of the Gemini family in cross-modal reasoning and language understanding will enable a wide variety of use cases. We discuss our approach toward post-training and deploying Gemini models responsibly to users through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud Vertex AI.

Submitted 17 June, 2024; v1 submitted 18 December, 2023; originally announced December 2023.
arXiv:2312.04679  [pdf, other]  eess.IV cs.CV

ConVRT: Consistent Video Restoration Through Turbulence with Test-time Optimization of Neural Video Representations

Authors: Haoming Cai, Jingxi Chen, Brandon Y. Feng, Weiyun Jiang, Mingyang Xie, Kevin Zhang, Ashok Veeraraghavan, Christopher Metzler

Abstract: Atmospheric turbulence presents a significant challenge in long-range imaging. Current restoration algorithms often struggle with temporal inconsistency, as well as limited generalization ability across varying turbulence levels and scene content different from the training data. To tackle these issues, we introduce a self-supervised method, Consistent Video Restoration through Turbulence (ConVRT), a test-time optimization method featuring a neural video representation designed to enhance temporal consistency in restoration. A key innovation of ConVRT is the integration of a pretrained vision-language model (CLIP) for semantic-oriented supervision, which steers the restoration towards sharp, photorealistic images in the CLIP latent space. We further develop a principled selection strategy of text prompts, based on their statistical correlation with a perceptual metric. ConVRT's test-time optimization allows it to adapt to a wide range of real-world turbulence conditions, effectively leveraging the insights gained from pre-trained models on simulated data. ConVRT offers a comprehensive and effective solution for mitigating real-world turbulence in dynamic videos.

Submitted 7 December, 2023; originally announced December 2023.
Comments: https://convrt-2024.github.io/
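The test-time optimization idea described in this abstract can be sketched as a simple loop that fits a per-video neural representation against the degraded frames plus a CLIP-style semantic loss. Everything below (the video network, degradation model, CLIP-style encoders, prompts, and learning rates) is an assumed placeholder, not the ConVRT implementation.

```python
import torch
import torch.nn.functional as F

# Assumed components (placeholders, not ConVRT's actual modules):
#   video_net(t)            -> restored frames for time indices t (a small neural video representation)
#   degrade(frames)         -> differentiable approximation of the turbulence degradation
#   clip_image_embed(frames), clip_text_embed(prompts) -> unit-norm CLIP-style embeddings

def test_time_optimize(video_net, degrade, clip_image_embed, clip_text_embed,
                       observed, prompts=("a sharp photo",), steps=500, lam=0.1):
    """Fit a neural video representation to one degraded video at test time."""
    text_feat = clip_text_embed(prompts)                      # (P, D), frozen text features
    opt = torch.optim.Adam(video_net.parameters(), lr=1e-3)
    t = torch.arange(observed.shape[0])
    for _ in range(steps):
        restored = video_net(t)                               # (T, C, H, W)
        # Data fidelity: re-degraded output should match the observed turbulent frames.
        fidelity = F.mse_loss(degrade(restored), observed)
        # Semantic supervision: push restored frames toward "sharp" text prompts in CLIP space.
        img_feat = clip_image_embed(restored)                 # (T, D)
        semantic = 1.0 - (img_feat @ text_feat.T).mean()
        loss = fidelity + lam * semantic
        opt.zero_grad(); loss.backward(); opt.step()
    return video_net(t).detach()
```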
arXiv:2312.03788  [pdf, other]  cs.LG cs.CL

SmoothQuant+: Accurate and Efficient 4-bit Post-Training Weight Quantization for LLM

Authors: Jiayi Pan, Chengcan Wang, Kaifu Zheng, Yangguang Li, Zhenyu Wang, Bin Feng

Abstract: Large language models (LLMs) have shown remarkable capabilities in various tasks. However, their huge model size and the consequent demand for computational and memory resources also pose challenges to model deployment. Currently, 4-bit post-training quantization (PTQ) has achieved some success in LLMs, reducing the memory footprint by approximately 75% compared to FP16 models, albeit with some accuracy loss. In this paper, we propose SmoothQuant+, an accurate and efficient 4-bit weight-only PTQ that requires no additional training and, for the first time, enables lossless accuracy for LLMs. Based on the fact that the loss of weight quantization is amplified by activation outliers, SmoothQuant+ smooths the activation outliers by channel before quantization, while adjusting the corresponding weights for mathematical equivalence, and then performs group-wise 4-bit weight quantization for linear layers. We have integrated SmoothQuant+ into the vLLM framework, an advanced high-throughput inference engine specially developed for LLMs, and equipped it with efficient W4A16 CUDA kernels, so that vLLM can seamlessly support SmoothQuant+ 4-bit weight quantization. Our results show that, with SmoothQuant+, the Code Llama-34B model can be quantized and deployed on an A100 40GB GPU, achieving lossless accuracy and a throughput increase of 1.9 to 4.0 times compared to the FP16 model deployed on two A100 40GB GPUs. Moreover, the latency per token is only 68% of the FP16 model deployed on two A100 40GB GPUs. To the best of our knowledge, this is the state-of-the-art 4-bit weight quantization for LLMs.

Submitted 6 December, 2023; originally announced December 2023.
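As a rough illustration of the channel-wise smoothing followed by group-wise 4-bit weight quantization described above, here is a NumPy sketch. The smoothing exponent, group size, and calibration statistics are illustrative assumptions, not the paper's exact procedure.

```python
import numpy as np

def smooth_and_quantize(W, act_absmax, alpha=0.5, group_size=128):
    """Channel-wise smoothing followed by group-wise symmetric int4 weight quantization.

    W:          (out_features, in_features) weight of a linear layer (in_features % group_size == 0).
    act_absmax: (in_features,) per-channel max |activation| from a calibration set.
    """
    # Per-input-channel smoothing scale: shrink channels with large activation outliers, and fold
    # the inverse scale into the weights so y = (x / s) @ (W * s).T is mathematically unchanged.
    w_absmax = np.abs(W).max(axis=0) + 1e-8
    s = act_absmax**alpha / w_absmax**(1.0 - alpha)
    W_s = W * s[None, :]

    # Group-wise symmetric 4-bit quantization along the input dimension.
    out_f, in_f = W_s.shape
    W_g = W_s.reshape(out_f, in_f // group_size, group_size)
    scale = np.abs(W_g).max(axis=-1, keepdims=True) / 7.0          # int4 range: [-8, 7]
    q = np.clip(np.round(W_g / scale), -8, 7).astype(np.int8)
    W_deq = (q * scale).reshape(out_f, in_f)                        # dequantized view, for checking
    return q, scale, s, W_deq

# Toy usage with hypothetical shapes.
rng = np.random.default_rng(0)
W = rng.standard_normal((256, 512)).astype(np.float32)
act_absmax = np.abs(rng.standard_normal(512)).astype(np.float32) * 5 + 0.1
q, scale, s, W_deq = smooth_and_quantize(W, act_absmax)
print(q.shape, W_deq.shape)   # (256, 4, 128) (256, 512)
```

At inference time, the activations would be divided by `s` (or `s` folded into the preceding layer) so the smoothed weights see correspondingly scaled inputs.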
arXiv:2312.01195  [pdf, other]  cs.CR cs.SE

AIM: Automatic Interrupt Modeling for Dynamic Firmware Analysis

Authors: Bo Feng, Meng Luo, Changming Liu, Long Lu, Engin Kirda

Abstract: The security of microcontrollers, which drive modern IoT and embedded devices, continues to raise major concerns. Within a microcontroller (MCU), the firmware is a monolithic piece of software that contains the whole software stack, whereas a variety of peripherals represent the hardware. As MCU firmware contains vulnerabilities, it is ideal to test firmware with off-the-shelf software testing techniques, such as dynamic symbolic execution and fuzzing. Nevertheless, no emulator can emulate the diverse MCU peripherals or execute/test the firmware. Specifically, the interrupt interface, among all I/O interfaces used by MCU peripherals, is extremely challenging to emulate. In this paper, we present AIM -- a generic, scalable, and hardware-independent dynamic firmware analysis framework that supports unemulated MCU peripherals by a novel interrupt modeling mechanism. AIM effectively and efficiently covers interrupt-dependent code in firmware by a novel, firmware-guided, Just-in-Time Interrupt Firing technique. We implemented our framework in angr and performed dynamic symbolic execution for eight real-world MCU firmware. According to testing results, our framework covered up to 11.2 times more interrupt-dependent code than state-of-the-art approaches while accomplishing several challenging goals not feasible previously. Finally, a comparison with a state-of-the-art firmware fuzzer demonstrates that dynamic symbolic execution and fuzzing together can achieve better firmware testing coverage.

Submitted 2 December, 2023; originally announced December 2023.
Comments: This paper was accepted to IEEE Transactions on Dependable and Secure Computing on Oct 12, 2023
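For orientation, the abstract builds on dynamic symbolic execution of raw firmware images in angr. Below is a minimal, generic angr setup for a bare-metal firmware blob; the firmware path, architecture, and addresses are hypothetical, and AIM's interrupt modeling and Just-in-Time Interrupt Firing are not reproduced here.

```python
import angr

# Hypothetical bare-metal image and addresses; this only shows the underlying
# symbolic-execution workflow that a framework like AIM builds on.
proj = angr.Project(
    "firmware.bin",
    main_opts={
        "backend": "blob",          # raw binary image, no executable headers
        "arch": "ARMEL",
        "base_addr": 0x08000000,
        "entry_point": 0x08000100,
    },
    auto_load_libs=False,
)

state = proj.factory.entry_state()
simgr = proj.factory.simulation_manager(state)

# Symbolically explore until a (hypothetical) interrupt-dependent handler address is reached.
simgr.explore(find=0x08000400, num_find=1)
if simgr.found:
    path_state = simgr.found[0]
    print("reached target with", len(path_state.solver.constraints), "path constraints")
```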
arXiv:2310.10835  [pdf, other]  eess.IV cs.CV cs.LG

Provable Probabilistic Imaging using Score-Based Generative Priors

Authors: Yu Sun, Zihui Wu, Yifan Chen, Berthy T. Feng, Katherine L. Bouman

Abstract: Estimating high-quality images while also quantifying their uncertainty are two desired features in an image reconstruction algorithm for solving ill-posed inverse problems. In this paper, we propose plug-and-play Monte Carlo (PMC) as a principled framework for characterizing the space of possible solutions to a general inverse problem. PMC is able to incorporate expressive score-based generative priors for high-quality image reconstruction while also performing uncertainty quantification via posterior sampling. In particular, we develop two PMC algorithms that can be viewed as the sampling analogues of the traditional plug-and-play priors (PnP) and regularization by denoising (RED) algorithms. To improve the sampling efficiency, we introduce weighted annealing into these PMC algorithms, further developing two additional annealed PMC algorithms (APMC). We establish a theoretical analysis for characterizing the convergence behavior of PMC algorithms. Our analysis provides non-asymptotic stationarity guarantees in terms of the Fisher information, fully compatible with the joint presence of weighted annealing, potentially non-log-concave likelihoods, and imperfect score networks. We demonstrate the performance of the PMC algorithms on multiple representative inverse problems with both linear and nonlinear forward models. Experimental results show that PMC significantly improves reconstruction quality and enables high-fidelity uncertainty quantification.

Submitted 28 August, 2024; v1 submitted 16 October, 2023; originally announced October 2023.
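The posterior-sampling idea behind plug-and-play Monte Carlo can be illustrated with a generic unadjusted Langevin step that combines a data-fidelity gradient with a learned score network. The step size and score model below are placeholders; this is not the paper's PMC/APMC algorithms or their annealing schedule.

```python
import torch

def pnp_langevin_sample(y, forward_op, score_fn, x0, sigma_noise=0.05,
                        step=1e-4, n_steps=2000):
    """Generic plug-and-play Langevin sampler for an inverse problem y = A(x) + noise.

    forward_op: differentiable measurement operator A.
    score_fn:   callable s(x) approximating grad_x log p(x), e.g. a pretrained score network.
    """
    x = x0.detach().clone().requires_grad_(True)
    for _ in range(n_steps):
        # Gradient of the Gaussian log-likelihood log p(y | x).
        residual = forward_op(x) - y
        loglik = -(residual ** 2).sum() / (2 * sigma_noise ** 2)
        grad_loglik, = torch.autograd.grad(loglik, x)
        with torch.no_grad():
            prior_score = score_fn(x)                     # plug-in score-based prior
            noise = torch.randn_like(x)
            # Unadjusted Langevin update: drift toward high posterior density plus noise.
            x += step * (grad_loglik + prior_score) + (2 * step) ** 0.5 * noise
    return x.detach()
```

Running this chain many times (or keeping samples after burn-in) yields approximate posterior samples, from which pixel-wise means and uncertainty maps can be computed.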
arXiv:2310.06504  [pdf, other]  cs.CL cs.AI cs.LG

Revisit Input Perturbation Problems for LLMs: A Unified Robustness Evaluation Framework for Noisy Slot Filling Task

Authors: Guanting Dong, Jinxu Zhao, Tingfeng Hui, Daichi Guo, Wenlong Wan, Boqi Feng, Yueyan Qiu, Zhuoma Gongque, Keqing He, Zechen Wang, Weiran Xu

Abstract: With the increasing capabilities of large language models (LLMs), these high-performance models have achieved state-of-the-art results on a wide range of natural language processing (NLP) tasks. However, the models' performance on commonly-used benchmark datasets often fails to accurately reflect their reliability and robustness when applied to real-world noisy data. To address these challenges, we propose a unified robustness evaluation framework based on the slot-filling task to systematically evaluate the dialogue understanding capability of LLMs in diverse input perturbation scenarios. Specifically, we construct an input perturbation evaluation dataset, Noise-LLM, which contains five types of single perturbation and four types of mixed perturbation data. Furthermore, we utilize a multi-level data augmentation method (character, word, and sentence levels) to construct a candidate data pool, and carefully design two automatic task demonstration construction strategies (instance-level and entity-level) with various prompt templates. Our aim is to assess how well various robustness methods of LLMs perform in real-world noisy scenarios. Our experiments demonstrate that current open-source LLMs generally achieve only limited robustness to input perturbations. Based on these experimental observations, we offer some forward-looking suggestions to fuel research in this direction.

Submitted 10 October, 2023; originally announced October 2023.
Comments: Accepted at NLPCC 2023 (Oral Presentation)
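To make the multi-level (character/word-level) perturbation idea concrete, here is a small hypothetical sketch of two such augmentations. The specific operations, rates, and synonym dictionary are illustrative and are not the Noise-LLM dataset construction.

```python
import random

def char_perturb(text, rate=0.05, seed=0):
    """Character-level noise: randomly swap adjacent characters or drop a character."""
    rng = random.Random(seed)
    chars, out, i = list(text), [], 0
    while i < len(chars):
        r = rng.random()
        if r < rate and i + 1 < len(chars):       # swap with the next character
            out.extend([chars[i + 1], chars[i]])
            i += 2
        elif r < 2 * rate:                        # drop this character
            i += 1
        else:
            out.append(chars[i])
            i += 1
    return "".join(out)

def word_perturb(text, synonyms, rate=0.2, seed=0):
    """Word-level noise: replace words with synonyms from a provided dictionary."""
    rng = random.Random(seed)
    words = [rng.choice(synonyms[w]) if w in synonyms and rng.random() < rate else w
             for w in text.split()]
    return " ".join(words)

utterance = "book a flight from boston to denver tomorrow morning"
print(char_perturb(utterance))
print(word_perturb(utterance, {"book": ["reserve"], "flight": ["plane ticket"]}))
```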
arXiv:2310.03125  [pdf, other]  cs.CV

Shielding the Unseen: Privacy Protection through Poisoning NeRF with Spatial Deformation

Authors: Yihan Wu, Brandon Y. Feng, Heng Huang

Abstract: In this paper, we introduce an innovative method of safeguarding user privacy against the generative capabilities of Neural Radiance Fields (NeRF) models. Our novel poisoning attack method induces changes to observed views that are imperceptible to the human eye, yet potent enough to disrupt NeRF's ability to accurately reconstruct a 3D scene. To achieve this, we devise a bi-level optimization algorithm incorporating a Projected Gradient Descent (PGD)-based spatial deformation. We extensively test our approach on two common NeRF benchmark datasets consisting of 29 real-world scenes with high-quality images. Our results compellingly demonstrate that our privacy-preserving method significantly impairs NeRF's performance across these benchmark datasets. Additionally, we show that our method is adaptable and versatile, functioning across various perturbation strengths and NeRF architectures. This work offers valuable insights into NeRF's vulnerabilities and emphasizes the need to account for such potential privacy risks when developing robust 3D scene reconstruction algorithms. Our study contributes to the larger conversation surrounding responsible AI and generative machine learning, aiming to protect user privacy and respect creative ownership in the digital age.

Submitted 4 October, 2023; originally announced October 2023.
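For intuition about the PGD component mentioned above, here is a generic projected gradient descent loop that perturbs training views within an L-infinity ball so as to maximize a victim model's loss. It perturbs pixels directly rather than applying the paper's spatial deformation, and all names and budgets are placeholders.

```python
import torch

def pgd_poison(images, loss_fn, eps=8 / 255, alpha=2 / 255, steps=20):
    """Generic PGD on input images: maximize loss_fn within an L_inf ball of radius eps.

    loss_fn: callable taking perturbed images and returning a scalar loss of the victim
             model (e.g., a NeRF's photometric reconstruction loss on these views).
    """
    delta = torch.zeros_like(images).uniform_(-eps, eps).requires_grad_(True)
    for _ in range(steps):
        loss = loss_fn((images + delta).clamp(0, 1))
        grad, = torch.autograd.grad(loss, delta)
        with torch.no_grad():
            delta += alpha * grad.sign()          # gradient *ascent* to maximize the loss
            delta.clamp_(-eps, eps)               # project back into the L_inf ball
    return (images + delta.detach()).clamp(0, 1)
```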
arXiv:2309.17293  [pdf, other]  quant-ph cs.CR cs.ET  doi:10.1007/s10773-023-05382-0

Quantum Privacy-preserving Two-party Circle Intersection Protocol Based on Phase-encoded Query

Authors: Zi-Xian Li, Qi Yang, Bao Feng, Wen-Jie Liu

Abstract: Privacy-preserving geometric intersection (PGI) is an important issue in secure multiparty computation (SMC). The existing quantum PGI protocols are mainly based on grid coding, which incurs high computational complexity. The phase-encoded query method, which has been used in some quantum SMC protocols, is well suited to decision problems, but it requires high-dimensional Oracle operators. In this paper, we use the principle of phase-encoded query to solve an important PGI problem, namely privacy-preserving two-party circle intersection. We study the implementation of the Oracle operator in detail, and achieve polynomial computational complexity by decomposing it into quantum arithmetic operations. Performance analysis shows that our protocol is correct and efficient, and can protect the privacy of all participants against internal and external attacks.

Submitted 29 September, 2023; originally announced September 2023.
Comments: 16 pages, 2 figures
Journal ref: International Journal of Theoretical Physics, 2023, 62(7): 138
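For reference, the underlying decision that such a protocol evaluates privately is the standard two-circle intersection test. The plain classical (non-private) predicate is sketched below; the quantum phase-encoded query machinery is omitted entirely.

```python
import math

def circles_intersect(c1, r1, c2, r2):
    """Classical predicate: do two circles (center, radius) intersect, including tangency?

    The circles intersect iff |r1 - r2| <= distance(c1, c2) <= r1 + r2.
    """
    d = math.dist(c1, c2)
    return abs(r1 - r2) <= d <= r1 + r2

print(circles_intersect((0, 0), 2.0, (3, 0), 1.5))   # True  (overlapping circles)
print(circles_intersect((0, 0), 1.0, (5, 0), 1.0))   # False (too far apart)
```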
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14349v1-abstract-full').style.display = 'none'; document.getElementById('2309.14349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11591">arXiv:2309.11591</a> <span> [<a href="https://arxiv.org/pdf/2309.11591">pdf</a>, <a href="https://arxiv.org/format/2309.11591">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Continuous Levels of Detail for Light Field Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">David Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+Y">Brandon Y. Feng</a>, <a href="/search/cs?searchtype=author&query=Varshney%2C+A">Amitabh Varshney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.11591v1-abstract-short" style="display: inline;"> Recently, several approaches have emerged for generating neural representations with multiple levels of detail (LODs). LODs can improve the rendering by using lower resolutions and smaller model sizes when appropriate. However, existing methods generally focus on a few discrete LODs which suffer from aliasing and flicker artifacts as details are changed and limit their granularity for adapting to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11591v1-abstract-full').style.display = 'inline'; document.getElementById('2309.11591v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11591v1-abstract-full" style="display: none;"> Recently, several approaches have emerged for generating neural representations with multiple levels of detail (LODs). LODs can improve the rendering by using lower resolutions and smaller model sizes when appropriate. However, existing methods generally focus on a few discrete LODs which suffer from aliasing and flicker artifacts as details are changed and limit their granularity for adapting to resource limitations. In this paper, we propose a method to encode light field networks with continuous LODs, allowing for finely tuned adaptations to rendering conditions. Our training procedure uses summed-area table filtering allowing efficient and continuous filtering at various LODs. Furthermore, we use saliency-based importance sampling which enables our light field networks to distribute their capacity, particularly limited at lower LODs, towards representing the details viewers are most likely to focus on. 
arXiv:2309.11591  [pdf, other]  cs.CV cs.GR

Continuous Levels of Detail for Light Field Networks

Authors: David Li, Brandon Y. Feng, Amitabh Varshney

Abstract: Recently, several approaches have emerged for generating neural representations with multiple levels of detail (LODs). LODs can improve the rendering by using lower resolutions and smaller model sizes when appropriate. However, existing methods generally focus on a few discrete LODs which suffer from aliasing and flicker artifacts as details are changed and limit their granularity for adapting to resource limitations. In this paper, we propose a method to encode light field networks with continuous LODs, allowing for finely tuned adaptations to rendering conditions. Our training procedure uses summed-area table filtering allowing efficient and continuous filtering at various LODs. Furthermore, we use saliency-based importance sampling which enables our light field networks to distribute their capacity, particularly limited at lower LODs, towards representing the details viewers are most likely to focus on. Incorporating continuous LODs into neural representations enables progressive streaming of neural representations, decreasing the latency and resource utilization for rendering.

Submitted 20 September, 2023; originally announced September 2023.
Comments: Accepted to BMVC 2023. Webpage at https://augmentariumlab.github.io/continuous-lfn/
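Summed-area table (box) filtering, which the training procedure above relies on, can be illustrated in a few lines. This is the generic 2D construction and constant-time box query, not the authors' training code.

```python
import numpy as np

def summed_area_table(img):
    """2D summed-area table: sat[i, j] = sum of img[:i, :j] (zero-padded on top/left)."""
    sat = np.zeros((img.shape[0] + 1, img.shape[1] + 1), dtype=np.float64)
    sat[1:, 1:] = img.cumsum(axis=0).cumsum(axis=1)
    return sat

def box_mean(sat, y0, x0, y1, x1):
    """Mean of img[y0:y1, x0:x1] in O(1) using four SAT lookups."""
    total = sat[y1, x1] - sat[y0, x1] - sat[y1, x0] + sat[y0, x0]
    return total / ((y1 - y0) * (x1 - x0))

img = np.arange(16, dtype=np.float64).reshape(4, 4)
sat = summed_area_table(img)
print(box_mean(sat, 1, 1, 3, 3))    # 7.5, in constant time
print(img[1:3, 1:3].mean())         # 7.5, same value by direct computation
```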
arXiv:2309.01949  [pdf, other]  cs.CV

Variational Bayesian Imaging with an Efficient Surrogate Score-based Prior

Authors: Berthy T. Feng, Katherine L. Bouman

Abstract: We propose a surrogate function for efficient yet principled use of score-based priors in Bayesian imaging. We consider ill-posed inverse imaging problems in which one aims for a clean image posterior given incomplete or noisy measurements. Since the measurements do not uniquely determine a true image, a prior is needed to constrain the solution space. Recent work turned score-based diffusion models into principled priors for solving ill-posed imaging problems by appealing to an ODE-based log-probability function. However, evaluating the ODE is computationally inefficient and inhibits posterior estimation of high-dimensional images. Our proposed surrogate prior is based on the evidence lower bound of a score-based diffusion model. We demonstrate the surrogate prior on variational inference for efficient approximate posterior sampling of large images. Compared to the exact prior in previous work, our surrogate accelerates optimization of the variational image distribution by at least two orders of magnitude. We also find that our principled approach gives more accurate posterior estimation than non-variational diffusion-based approaches that involve hyperparameter-tuning at inference. Our work establishes a practical path forward for using score-based diffusion models as general-purpose image priors.

Submitted 27 August, 2024; v1 submitted 5 September, 2023; originally announced September 2023.
Comments: Published in Transactions on Machine Learning Research (TMLR) August 2024
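As a generic illustration of variational inference with a plug-in image prior of the kind described above, the sketch below fits a mean-field Gaussian posterior by maximizing a Monte Carlo evidence lower bound. The `surrogate_log_prior` callable stands in for a diffusion-ELBO-style surrogate and is an assumption; this is not the authors' implementation.

```python
import math
import torch

def fit_variational_posterior(y, forward_op, surrogate_log_prior, shape,
                              sigma_noise=0.05, steps=1000, n_samples=4):
    """Mean-field Gaussian variational inference for an inverse problem y = A(x) + noise.

    forward_op:          differentiable measurement operator A, applied per sample.
    surrogate_log_prior: callable returning an approximate log p(x) per sample
                         (placeholder for a surrogate score-based prior).
    """
    mu = torch.zeros(shape, requires_grad=True)
    log_sigma = torch.full(shape, -2.0, requires_grad=True)
    opt = torch.optim.Adam([mu, log_sigma], lr=1e-2)
    d = mu.numel()
    for _ in range(steps):
        sigma = log_sigma.exp()
        # Reparameterized samples x ~ q(x) = N(mu, diag(sigma^2)), shape (n_samples, *shape).
        x = mu + sigma * torch.randn((n_samples, *shape))
        resid = forward_op(x) - y
        loglik = -resid.flatten(1).pow(2).sum(dim=1) / (2 * sigma_noise ** 2)
        logp = surrogate_log_prior(x)                          # (n_samples,)
        entropy = log_sigma.sum() + 0.5 * d * math.log(2 * math.pi * math.e)
        elbo = (loglik + logp).mean() + entropy                # Monte Carlo ELBO estimate
        opt.zero_grad(); (-elbo).backward(); opt.step()
    return mu.detach(), log_sigma.exp().detach()
```

The returned mean and standard deviation give an approximate posterior image and a per-pixel uncertainty estimate, the two quantities the abstract is concerned with.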