Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 83 results for author: <span class="mathjax">Bao, C</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Bao%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Bao, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Bao%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Bao, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Bao%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Bao%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Bao%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05054">arXiv:2502.05054</a> <span> [<a href="https://arxiv.org/pdf/2502.05054">pdf</a>, <a href="https://arxiv.org/format/2502.05054">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Mining a Decade of Event Impacts on Contributor Dynamics in Ethereum: A Longitudinal Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vaccargiu%2C+M">Matteo Vaccargiu</a>, <a href="/search/cs?searchtype=author&query=Aufiero%2C+S">Sabrina Aufiero</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+C">Cheick Ba</a>, <a href="/search/cs?searchtype=author&query=Bartolucci%2C+S">Silvia Bartolucci</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+R">Richard Clegg</a>, <a href="/search/cs?searchtype=author&query=Graziotin%2C+D">Daniel Graziotin</a>, <a href="/search/cs?searchtype=author&query=Neykova%2C+R">Rumyana Neykova</a>, <a href="/search/cs?searchtype=author&query=Tonelli%2C+R">Roberto Tonelli</a>, <a href="/search/cs?searchtype=author&query=Destefanis%2C+G">Giuseppe Destefanis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05054v1-abstract-short" style="display: inline;"> We analyze developer activity across 10 major Ethereum repositories (totaling 129884 commits, 40550 issues) spanning 10 years to examine how events such as technical upgrades, market events, and community decisions impact development. Through statistical, survival, and network analyses, we find that technical events prompt increased activity before the event, followed by reduced commit rates after… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05054v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05054v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05054v1-abstract-full" style="display: none;"> We analyze developer activity across 10 major Ethereum repositories (totaling 129884 commits, 40550 issues) spanning 10 years to examine how events such as technical upgrades, market events, and community decisions impact development. Through statistical, survival, and network analyses, we find that technical events prompt increased activity before the event, followed by reduced commit rates afterwards, whereas market events lead to more reactive development. Core infrastructure repositories like Go-Ethereum exhibit faster issue resolution compared to developer tools, and technical events enhance core team collaboration. Our findings show how different types of events shape development dynamics, offering insights for project managers and developers in maintaining development momentum through major transitions. This work contributes to understanding the resilience of development communities and their adaptation to ecosystem changes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05054v1-abstract-full').style.display = 'none'; document.getElementById('2502.05054v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures. To appear in 22nd IEEE/ACM International Conference on Mining Software Repositories, MSR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04799">arXiv:2502.04799</a> <span> [<a href="https://arxiv.org/pdf/2502.04799">pdf</a>, <a href="https://arxiv.org/format/2502.04799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Regularized Newton Method for Nonconvex Optimization with Global and Local Complexity Guarantees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuhao Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jintao Xu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Chao Ding</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04799v2-abstract-short" style="display: inline;"> We consider the problem of finding an $蔚$-stationary point of a nonconvex function with a Lipschitz continuous Hessian and propose a quadratic regularized Newton method incorporating a new class of regularizers constructed from the current and previous gradients. The method leverages a recently developed linear conjugate gradient approach with a negative curvature monitor to solve the regularized… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04799v2-abstract-full').style.display = 'inline'; document.getElementById('2502.04799v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04799v2-abstract-full" style="display: none;"> We consider the problem of finding an $蔚$-stationary point of a nonconvex function with a Lipschitz continuous Hessian and propose a quadratic regularized Newton method incorporating a new class of regularizers constructed from the current and previous gradients. The method leverages a recently developed linear conjugate gradient approach with a negative curvature monitor to solve the regularized Newton equation. Notably, our algorithm is adaptive, requiring no prior knowledge of the Lipschitz constant of the Hessian, and achieves a global complexity of $O(蔚^{-\frac{3}{2}}) + \tilde O(1)$ in terms of the second-order oracle calls, and $\tilde O(蔚^{-\frac{7}{4}})$ for Hessian-vector products, respectively. Moreover, when the iterates converge to a point where the Hessian is positive definite, the method exhibits quadratic local convergence. Preliminary numerical results illustrate the competitiveness of our algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04799v2-abstract-full').style.display = 'none'; document.getElementById('2502.04799v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13187">arXiv:2412.13187</a> <span> [<a href="https://arxiv.org/pdf/2412.13187">pdf</a>, <a href="https://arxiv.org/format/2412.13187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HandsOnVLM: Vision-Language Models for Hand-Object Interaction Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiarui Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Abhinav Gupta</a>, <a href="/search/cs?searchtype=author&query=Bharadhwaj%2C+H">Homanga Bharadhwaj</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13187v2-abstract-short" style="display: inline;"> How can we predict future interaction trajectories of human hands in a scene given high-level colloquial task specifications in the form of natural language? In this paper, we extend the classic hand trajectory prediction task to two tasks involving explicit or implicit language queries. Our proposed tasks require extensive understanding of human daily activities and reasoning abilities about what… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13187v2-abstract-full').style.display = 'inline'; document.getElementById('2412.13187v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13187v2-abstract-full" style="display: none;"> How can we predict future interaction trajectories of human hands in a scene given high-level colloquial task specifications in the form of natural language? In this paper, we extend the classic hand trajectory prediction task to two tasks involving explicit or implicit language queries. Our proposed tasks require extensive understanding of human daily activities and reasoning abilities about what should be happening next given cues from the current scene. We also develop new benchmarks to evaluate the proposed two tasks, Vanilla Hand Prediction (VHP) and Reasoning-Based Hand Prediction (RBHP). We enable solving these tasks by integrating high-level world knowledge and reasoning capabilities of Vision-Language Models (VLMs) with the auto-regressive nature of low-level ego-centric hand trajectories. Our model, HandsOnVLM is a novel VLM that can generate textual responses and produce future hand trajectories through natural-language conversations. Our experiments show that HandsOnVLM outperforms existing task-specific methods and other VLM baselines on proposed tasks, and demonstrates its ability to effectively utilize world knowledge for reasoning about low-level human hand trajectories based on the provided context. Our website contains code and detailed video results https://www.chenbao.tech/handsonvlm/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13187v2-abstract-full').style.display = 'none'; document.getElementById('2412.13187v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09428">arXiv:2412.09428</a> <span> [<a href="https://arxiv.org/pdf/2412.09428">pdf</a>, <a href="https://arxiv.org/format/2412.09428">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Music Generation with Explicit Bridges and Retrieval Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baisen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+L">Le Zhuo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaokai Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenxi Bao</a>, <a href="/search/cs?searchtype=author&query=Chengjing%2C+W">Wu Chengjing</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+X">Xuecheng Nie</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jiao Dai</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jizhong Han</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yue Liao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Si Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09428v1-abstract-short" style="display: inline;"> Multimodal music generation aims to produce music from diverse input modalities, including text, videos, and images. Existing methods use a common embedding space for multimodal fusion. Despite their effectiveness in other modalities, their application in multimodal music generation faces challenges of data scarcity, weak cross-modal alignment, and limited controllability. This paper addresses the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09428v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09428v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09428v1-abstract-full" style="display: none;"> Multimodal music generation aims to produce music from diverse input modalities, including text, videos, and images. Existing methods use a common embedding space for multimodal fusion. Despite their effectiveness in other modalities, their application in multimodal music generation faces challenges of data scarcity, weak cross-modal alignment, and limited controllability. This paper addresses these issues by using explicit bridges of text and music for multimodal alignment. We introduce a novel method named Visuals Music Bridge (VMB). Specifically, a Multimodal Music Description Model converts visual inputs into detailed textual descriptions to provide the text bridge; a Dual-track Music Retrieval module that combines broad and targeted retrieval strategies to provide the music bridge and enable user control. Finally, we design an Explicitly Conditioned Music Generation framework to generate music based on the two bridges. We conduct experiments on video-to-music, image-to-music, text-to-music, and controllable music generation tasks, along with experiments on controllability. The results demonstrate that VMB significantly enhances music quality, modality, and customization alignment compared to previous methods. VMB sets a new standard for interpretable and expressive multimodal music generation with applications in various multimedia fields. Demos and code are available at https://github.com/wbs2788/VMB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09428v1-abstract-full').style.display = 'none'; document.getElementById('2412.09428v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14110">arXiv:2411.14110</a> <span> [<a href="https://arxiv.org/pdf/2411.14110">pdf</a>, <a href="https://arxiv.org/format/2411.14110">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> RAG-Thief: Scalable Extraction of Private Data from Retrieval-Augmented Generation Applications with Agent-based Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Changyue Jiang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xudong Pan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+G">Geng Hong</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenfu Bao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14110v1-abstract-short" style="display: inline;"> While large language models (LLMs) have achieved notable success in generative tasks, they still face limitations, such as lacking up-to-date knowledge and producing hallucinations. Retrieval-Augmented Generation (RAG) enhances LLM performance by integrating external knowledge bases, providing additional context which significantly improves accuracy and knowledge coverage. However, building these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14110v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14110v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14110v1-abstract-full" style="display: none;"> While large language models (LLMs) have achieved notable success in generative tasks, they still face limitations, such as lacking up-to-date knowledge and producing hallucinations. Retrieval-Augmented Generation (RAG) enhances LLM performance by integrating external knowledge bases, providing additional context which significantly improves accuracy and knowledge coverage. However, building these external knowledge bases often requires substantial resources and may involve sensitive information. In this paper, we propose an agent-based automated privacy attack called RAG-Thief, which can extract a scalable amount of private data from the private database used in RAG applications. We conduct a systematic study on the privacy risks associated with RAG applications, revealing that the vulnerability of LLMs makes the private knowledge bases suffer significant privacy risks. Unlike previous manual attacks which rely on traditional prompt injection techniques, RAG-Thief starts with an initial adversarial query and learns from model responses, progressively generating new queries to extract as many chunks from the knowledge base as possible. Experimental results show that our RAG-Thief can extract over 70% information from the private knowledge bases within customized RAG applications deployed on local machines and real-world platforms, including OpenAI's GPTs and ByteDance's Coze. Our findings highlight the privacy vulnerabilities in current RAG applications and underscore the pressing need for stronger safeguards. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14110v1-abstract-full').style.display = 'none'; document.getElementById('2411.14110v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10632">arXiv:2411.10632</a> <span> [<a href="https://arxiv.org/pdf/2411.10632">pdf</a>, <a href="https://arxiv.org/format/2411.10632">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Quantifying community evolution in temporal networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+P">Peijie Zhong</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+C">Cheick Ba</a>, <a href="/search/cs?searchtype=author&query=Mondrag%C3%B3n%2C+R">Ra煤l Mondrag贸n</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+R">Richard Clegg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10632v2-abstract-short" style="display: inline;"> When we detect communities in temporal networks it is important to ask questions about how they change in time. Normalised Mutual Information (NMI) has been used to measure the similarity of communities when the nodes on a network do not change. We propose two extensions namely Union-Normalised Mutual Information (UNMI) and Intersection-Normalised Mutual Information (INMI). UNMI and INMI evaluate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10632v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10632v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10632v2-abstract-full" style="display: none;"> When we detect communities in temporal networks it is important to ask questions about how they change in time. Normalised Mutual Information (NMI) has been used to measure the similarity of communities when the nodes on a network do not change. We propose two extensions namely Union-Normalised Mutual Information (UNMI) and Intersection-Normalised Mutual Information (INMI). UNMI and INMI evaluate the similarity of community structure under the condition of node variation. Experiments show that these methods are effective in dealing with temporal networks with the changes in the set of nodes, and can capture the dynamic evolution of community structure in both synthetic and real temporal networks. This study not only provides a new similarity measurement method for network analysis but also helps to deepen the understanding of community change in complex temporal networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10632v2-abstract-full').style.display = 'none'; document.getElementById('2411.10632v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00425">arXiv:2410.00425</a> <span> [<a href="https://arxiv.org/pdf/2410.00425">pdf</a>, <a href="https://arxiv.org/format/2410.00425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ManiSkill3: GPU Parallelized Robotics Simulation and Rendering for Generalizable Embodied AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+S">Stone Tao</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+F">Fanbo Xiang</a>, <a href="/search/cs?searchtype=author&query=Shukla%2C+A">Arth Shukla</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yuzhe Qin</a>, <a href="/search/cs?searchtype=author&query=Hinrichsen%2C+X">Xander Hinrichsen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiaodi Yuan</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+X">Xinsong Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yulin Liu</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+T">Tse-kai Chan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuanlin Li</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+T">Tongzhou Mu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+N">Nan Xiao</a>, <a href="/search/cs?searchtype=author&query=Gurha%2C+A">Arnav Gurha</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhiao Huang</a>, <a href="/search/cs?searchtype=author&query=Calandra%2C+R">Roberto Calandra</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shan Luo</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hao Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00425v1-abstract-short" style="display: inline;"> Simulation has enabled unprecedented compute-scalable approaches to robot learning. However, many existing simulation frameworks typically support a narrow range of scenes/tasks and lack features critical for scaling generalizable robotics and sim2real. We introduce and open source ManiSkill3, the fastest state-visual GPU parallelized robotics simulator with contact-rich physics targeting generali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00425v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00425v1-abstract-full" style="display: none;"> Simulation has enabled unprecedented compute-scalable approaches to robot learning. However, many existing simulation frameworks typically support a narrow range of scenes/tasks and lack features critical for scaling generalizable robotics and sim2real. We introduce and open source ManiSkill3, the fastest state-visual GPU parallelized robotics simulator with contact-rich physics targeting generalizable manipulation. ManiSkill3 supports GPU parallelization of many aspects including simulation+rendering, heterogeneous simulation, pointclouds/voxels visual input, and more. Simulation with rendering on ManiSkill3 can run 10-1000x faster with 2-3x less GPU memory usage than other platforms, achieving up to 30,000+ FPS in benchmarked environments due to minimal python/pytorch overhead in the system, simulation on the GPU, and the use of the SAPIEN parallel rendering system. Tasks that used to take hours to train can now take minutes. We further provide the most comprehensive range of GPU parallelized environments/tasks spanning 12 distinct domains including but not limited to mobile manipulation for tasks such as drawing, humanoids, and dextrous manipulation in realistic scenes designed by artists or real-world digital twins. In addition, millions of demonstration frames are provided from motion planning, RL, and teleoperation. ManiSkill3 also provides a comprehensive set of baselines that span popular RL and learning-from-demonstrations algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00425v1-abstract-full').style.display = 'none'; document.getElementById('2410.00425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: http://maniskill.ai/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05938">arXiv:2409.05938</a> <span> [<a href="https://arxiv.org/pdf/2409.05938">pdf</a>, <a href="https://arxiv.org/format/2409.05938">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepFM-Crispr: Prediction of CRISPR On-Target Effects via Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Condy Bao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fuxiao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05938v1-abstract-short" style="display: inline;"> Since the advent of CRISPR-Cas9, a groundbreaking gene-editing technology that enables precise genomic modifications via a short RNA guide sequence, there has been a marked increase in the accessibility and application of this technology across various fields. The success of CRISPR-Cas9 has spurred further investment and led to the discovery of additional CRISPR systems, including CRISPR-Cas13. Di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05938v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05938v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05938v1-abstract-full" style="display: none;"> Since the advent of CRISPR-Cas9, a groundbreaking gene-editing technology that enables precise genomic modifications via a short RNA guide sequence, there has been a marked increase in the accessibility and application of this technology across various fields. The success of CRISPR-Cas9 has spurred further investment and led to the discovery of additional CRISPR systems, including CRISPR-Cas13. Distinct from Cas9, which targets DNA, Cas13 targets RNA, offering unique advantages for gene modulation. We focus on Cas13d, a variant known for its collateral activity where it non-specifically cleaves adjacent RNA molecules upon activation, a feature critical to its function. We introduce DeepFM-Crispr, a novel deep learning model developed to predict the on-target efficiency and evaluate the off-target effects of Cas13d. This model harnesses a large language model to generate comprehensive representations rich in evolutionary and structural data, thereby enhancing predictions of RNA secondary structures and overall sgRNA efficacy. A transformer-based architecture processes these inputs to produce a predictive efficacy score. Comparative experiments show that DeepFM-Crispr not only surpasses traditional models but also outperforms recent state-of-the-art deep learning methods in terms of prediction accuracy and reliability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05938v1-abstract-full').style.display = 'none'; document.getElementById('2409.05938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 page, 2 figures, accepted to ICMLA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05212">arXiv:2409.05212</a> <span> [<a href="https://arxiv.org/pdf/2409.05212">pdf</a>, <a href="https://arxiv.org/format/2409.05212">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> SS-BRPE: Self-Supervised Blind Room Parameter Estimation Using Attention Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunxi Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M">Maoshen Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meiran Li</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Changchun Bao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+W">Wenyu Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05212v1-abstract-short" style="display: inline;"> In recent years, dynamic parameterization of acoustic environments has garnered attention in audio processing. This focus includes room volume and reverberation time (RT60), which define local acoustics independent of sound source and receiver orientation. Previous studies show that purely attention-based models can achieve advanced results in room parameter estimation. However, their success reli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05212v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05212v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05212v1-abstract-full" style="display: none;"> In recent years, dynamic parameterization of acoustic environments has garnered attention in audio processing. This focus includes room volume and reverberation time (RT60), which define local acoustics independent of sound source and receiver orientation. Previous studies show that purely attention-based models can achieve advanced results in room parameter estimation. However, their success relies on supervised pretrainings that require a large amount of labeled true values for room parameters and complex training pipelines. In light of this, we propose a novel Self-Supervised Blind Room Parameter Estimation (SS-BRPE) system. This system combines a purely attention-based model with self-supervised learning to estimate room acoustic parameters, from single-channel noisy speech signals. By utilizing unlabeled audio data for pretraining, the proposed system significantly reduces dependencies on costly labeled datasets. Our model also incorporates dynamic feature augmentation during fine-tuning to enhance adaptability and generalizability. Experimental results demonstrate that the SS-BRPE system not only achieves more superior performance in estimating room parameters than state-of-the-art (SOTA) methods but also effectively maintains high accuracy under conditions with limited labeled data. Code available at https://github.com/bjut-chunxiwang/SS-BRPE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05212v1-abstract-full').style.display = 'none'; document.getElementById('2409.05212v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05707">arXiv:2408.05707</a> <span> [<a href="https://arxiv.org/pdf/2408.05707">pdf</a>, <a href="https://arxiv.org/format/2408.05707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fast and Scalable Semi-Supervised Learning for Multi-View Subspace Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+H">Huaming Ling</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiebo Song</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05707v1-abstract-short" style="display: inline;"> In this paper, we introduce a Fast and Scalable Semi-supervised Multi-view Subspace Clustering (FSSMSC) method, a novel solution to the high computational complexity commonly found in existing approaches. FSSMSC features linear computational and space complexity relative to the size of the data. The method generates a consensus anchor graph across all views, representing each data point as a spars… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05707v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05707v1-abstract-full" style="display: none;"> In this paper, we introduce a Fast and Scalable Semi-supervised Multi-view Subspace Clustering (FSSMSC) method, a novel solution to the high computational complexity commonly found in existing approaches. FSSMSC features linear computational and space complexity relative to the size of the data. The method generates a consensus anchor graph across all views, representing each data point as a sparse linear combination of chosen landmarks. Unlike traditional methods that manage the anchor graph construction and the label propagation process separately, this paper proposes a unified optimization model that facilitates simultaneous learning of both. An effective alternating update algorithm with convergence guarantees is proposed to solve the unified optimization model. Additionally, the method employs the obtained anchor graph and landmarks' low-dimensional representations to deduce low-dimensional representations for raw data. Following this, a straightforward clustering approach is conducted on these low-dimensional representations to achieve the final clustering results. The effectiveness and efficiency of FSSMSC are validated through extensive experiments on multiple benchmark datasets of varying scales. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05707v1-abstract-full').style.display = 'none'; document.getElementById('2408.05707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40 pages,7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05419">arXiv:2408.05419</a> <span> [<a href="https://arxiv.org/pdf/2408.05419">pdf</a>, <a href="https://arxiv.org/format/2408.05419">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Interface Laplace Learning: Learnable Interface Term Helps Semi-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tangjun Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05419v1-abstract-short" style="display: inline;"> We introduce a novel framework, called Interface Laplace learning, for graph-based semi-supervised learning. Motivated by the observation that an interface should exist between different classes where the function value is non-smooth, we introduce a Laplace learning model that incorporates an interface term. This model challenges the long-standing assumption that functions are smooth at all unlabe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05419v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05419v1-abstract-full" style="display: none;"> We introduce a novel framework, called Interface Laplace learning, for graph-based semi-supervised learning. Motivated by the observation that an interface should exist between different classes where the function value is non-smooth, we introduce a Laplace learning model that incorporates an interface term. This model challenges the long-standing assumption that functions are smooth at all unlabeled points. In the proposed approach, we add an interface term to the Laplace learning model at the interface positions. We provide a practical algorithm to approximate the interface positions using k-hop neighborhood indices, and to learn the interface term from labeled data without artificial design. Our method is efficient and effective, and we present extensive experiments demonstrating that Interface Laplace learning achieves better performance than other recent semi-supervised learning approaches at extremely low label rates on the MNIST, FashionMNIST, and CIFAR-10 datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05419v1-abstract-full').style.display = 'none'; document.getElementById('2408.05419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10614">arXiv:2407.10614</a> <span> [<a href="https://arxiv.org/pdf/2407.10614">pdf</a>, <a href="https://arxiv.org/format/2407.10614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Investigating shocking events in the Ethereum stablecoin ecosystem through temporal multilayer graph structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ba%2C+C+T">Cheick Tidiane Ba</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+R+G">Richard G. Clegg</a>, <a href="/search/cs?searchtype=author&query=Steer%2C+B+A">Ben A. Steer</a>, <a href="/search/cs?searchtype=author&query=Zignani%2C+M">Matteo Zignani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.10614v1-abstract-short" style="display: inline;"> In the dynamic landscape of the Web, we are witnessing the emergence of the Web3 paradigm, which dictates that platforms should rely on blockchain technology and cryptocurrencies to sustain themselves and their profitability. Cryptocurrencies are characterised by high market volatility and susceptibility to substantial crashes, issues that require temporal analysis methodologies able to tackle the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10614v1-abstract-full').style.display = 'inline'; document.getElementById('2407.10614v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.10614v1-abstract-full" style="display: none;"> In the dynamic landscape of the Web, we are witnessing the emergence of the Web3 paradigm, which dictates that platforms should rely on blockchain technology and cryptocurrencies to sustain themselves and their profitability. Cryptocurrencies are characterised by high market volatility and susceptibility to substantial crashes, issues that require temporal analysis methodologies able to tackle the high temporal resolution, heterogeneity and scale of blockchain data. While existing research attempts to analyse crash events, fundamental questions persist regarding the optimal time scale for analysis, differentiation between long-term and short-term trends, and the identification and characterisation of shock events within these decentralised systems. This paper addresses these issues by examining cryptocurrencies traded on the Ethereum blockchain, with a spotlight on the crash of the stablecoin TerraUSD and the currency LUNA designed to stabilise it. Utilising complex network analysis and a multi-layer temporal graph allows the study of the correlations between the layers representing the currencies and system evolution across diverse time scales. The investigation sheds light on the strong interconnections among stablecoins pre-crash and the significant post-crash transformations. We identify anomalous signals before, during, and after the collapse, emphasising their impact on graph structure metrics and user movement across layers. This paper pioneers temporal, cross-chain graph analysis to explore a cryptocurrency collapse. It emphasises the importance of temporal analysis for studies on web-derived data and how graph-based analysis can enhance traditional econometric results. Overall, this research carries implications beyond its field, for example for regulatory agencies aiming to safeguard users from shocks and monitor investment risks for citizens and clients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.10614v1-abstract-full').style.display = 'none'; document.getElementById('2407.10614v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11875">arXiv:2406.11875</a> <span> [<a href="https://arxiv.org/pdf/2406.11875">pdf</a>, <a href="https://arxiv.org/format/2406.11875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ChatPCG: Large Language Model-Driven Reward Design for Procedural Content Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Baek%2C+I">In-Chang Baek</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Tae-Hwa Park</a>, <a href="/search/cs?searchtype=author&query=Noh%2C+J">Jin-Ha Noh</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+C">Cheong-Mok Bae</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Joong Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11875v1-abstract-short" style="display: inline;"> Driven by the rapid growth of machine learning, recent advances in game artificial intelligence (AI) have significantly impacted productivity across various gaming genres. Reward design plays a pivotal role in training game AI models, wherein researchers implement concepts of specific reward functions. However, despite the presence of AI, the reward design process predominantly remains in the doma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11875v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11875v1-abstract-full" style="display: none;"> Driven by the rapid growth of machine learning, recent advances in game artificial intelligence (AI) have significantly impacted productivity across various gaming genres. Reward design plays a pivotal role in training game AI models, wherein researchers implement concepts of specific reward functions. However, despite the presence of AI, the reward design process predominantly remains in the domain of human experts, as it is heavily reliant on their creativity and engineering skills. Therefore, this paper proposes ChatPCG, a large language model (LLM)-driven reward design framework.It leverages human-level insights, coupled with game expertise, to generate rewards tailored to specific game features automatically. Moreover, ChatPCG is integrated with deep reinforcement learning, demonstrating its potential for multiplayer game content generation tasks. The results suggest that the proposed LLM exhibits the capability to comprehend game mechanics and content generation tasks, enabling tailored content generation for a specified game. This study not only highlights the potential for improving accessibility in content generation but also aims to streamline the game AI development process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11875v1-abstract-full').style.display = 'none'; document.getElementById('2406.11875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures, accepted at IEEE Conference on Games 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10260">arXiv:2405.10260</a> <span> [<a href="https://arxiv.org/pdf/2405.10260">pdf</a>, <a href="https://arxiv.org/format/2405.10260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Keep It Private: Unsupervised Privatization of Online Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Calvin Bao</a>, <a href="/search/cs?searchtype=author&query=Carpuat%2C+M">Marine Carpuat</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10260v1-abstract-short" style="display: inline;"> Authorship obfuscation techniques hold the promise of helping people protect their privacy in online communications by automatically rewriting text to hide the identity of the original author. However, obfuscation has been evaluated in narrow settings in the NLP literature and has primarily been addressed with superficial edit operations that can lead to unnatural outputs. In this work, we introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10260v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10260v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10260v1-abstract-full" style="display: none;"> Authorship obfuscation techniques hold the promise of helping people protect their privacy in online communications by automatically rewriting text to hide the identity of the original author. However, obfuscation has been evaluated in narrow settings in the NLP literature and has primarily been addressed with superficial edit operations that can lead to unnatural outputs. In this work, we introduce an automatic text privatization framework that fine-tunes a large language model via reinforcement learning to produce rewrites that balance soundness, sense, and privacy. We evaluate it extensively on a large-scale test set of English Reddit posts by 68k authors composed of short-medium length texts. We study how the performance changes among evaluative conditions including authorial profile length and authorship detection strategy. Our method maintains high text quality according to both automated metrics and human evaluation, and successfully evades several automated authorship attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10260v1-abstract-full').style.display = 'none'; document.getElementById('2405.10260v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19195">arXiv:2404.19195</a> <span> [<a href="https://arxiv.org/pdf/2404.19195">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ITherm55368.2023.10177653">10.1109/ITherm55368.2023.10177653 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Evaluation of Thermal Performance of a Wick-free Vapor Chamber in Power Electronics Cooling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mukhopadhyay%2C+A">Arani Mukhopadhyay</a>, <a href="/search/cs?searchtype=author&query=Pal%2C+A">Anish Pal</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Congbo Bao</a>, <a href="/search/cs?searchtype=author&query=Gukeh%2C+M+J">Mohamad Jafari Gukeh</a>, <a href="/search/cs?searchtype=author&query=Mazumder%2C+S+K">Sudip K. Mazumder</a>, <a href="/search/cs?searchtype=author&query=Megaridis%2C+C+M">Constantine M. Megaridis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19195v1-abstract-short" style="display: inline;"> Efficient thermal management in high-power electronics cooling can be achieved using phase-change heat transfer devices, such as vapor chambers. Traditional vapor chambers use wicks to transport condensate for efficient thermal exchange and to prevent "dry-out" of the evaporator. However, wicks in vapor chambers present significant design challenges arising out of large pressure drops across the w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19195v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19195v1-abstract-full" style="display: none;"> Efficient thermal management in high-power electronics cooling can be achieved using phase-change heat transfer devices, such as vapor chambers. Traditional vapor chambers use wicks to transport condensate for efficient thermal exchange and to prevent "dry-out" of the evaporator. However, wicks in vapor chambers present significant design challenges arising out of large pressure drops across the wicking material, which slows down condensate transport rates and increases the chances for dry-out. Thicker wicks add to overall thermal resistance, while deterring the development of thinner devices by limiting the total thickness of the vapor chamber. Wickless vapor chambers eliminate the use of metal wicks entirely, by incorporating complementary wettability-patterned flat plates on both the evaporator and the condenser side. Such surface modifications enhance fluid transport on the evaporator side, while allowing the chambers to be virtually as thin as imaginable, thereby permitting design of thermally efficient thin electronic cooling devices. While wick-free vapor chambers have been studied and efficient design strategies have been suggested, we delve into real-life applications of wick-free vapor chambers in forced air cooling of high-power electronics. An experimental setup is developed wherein two Si-based MOSFETs of TO-247-3 packaging having high conduction resistance, are connected in parallel and switched at 100 kHz, to emulate high frequency power electronics operations. A rectangular copper wick-free vapor chamber spreads heat laterally over a surface 13 times larger than the heating area. This chamber is cooled externally by a fan that circulates air at room temperature. The present experimental setup extends our previous work on wick-free vapor chambers, while demonstrating the effectiveness of low-cost air cooling in vapor-chamber enhanced high-power electronics applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19195v1-abstract-full').style.display = 'none'; document.getElementById('2404.19195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Presented at IEEE ITherm (Intersociety Conference on Thermal and Thermomechanical Phenomena in Electronic Systems) 2023, Orlando FL. Corresponding author: cmm@uic.edu</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.10944">arXiv:2404.10944</a> <span> [<a href="https://arxiv.org/pdf/2404.10944">pdf</a>, <a href="https://arxiv.org/format/2404.10944">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Threat Behavior Textual Search by Attention Graph Isomorphism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bae%2C+C">Chanwoo Bae</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+G">Guanhong Tao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.10944v2-abstract-short" style="display: inline;"> Cyber attacks cause over \$1 trillion loss every year. An important task for cyber security analysts is attack forensics. It entails understanding malware behaviors and attack origins. However, existing automated or manual malware analysis can only disclose a subset of behaviors due to inherent difficulties (e.g., malware cloaking and obfuscation). As such, analysts often resort to text search tec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10944v2-abstract-full').style.display = 'inline'; document.getElementById('2404.10944v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.10944v2-abstract-full" style="display: none;"> Cyber attacks cause over \$1 trillion loss every year. An important task for cyber security analysts is attack forensics. It entails understanding malware behaviors and attack origins. However, existing automated or manual malware analysis can only disclose a subset of behaviors due to inherent difficulties (e.g., malware cloaking and obfuscation). As such, analysts often resort to text search techniques to identify existing malware reports based on the symptoms they observe, exploiting the fact that malware samples share a lot of similarity, especially those from the same origin. In this paper, we propose a novel malware behavior search technique that is based on graph isomorphism at the attention layers of Transformer models. We also compose a large dataset collected from various agencies to facilitate such research. Our technique outperforms state-of-the-art methods, such as those based on sentence embeddings and keywords by 6-14%. In the case study of 10 real-world malwares, our technique can correctly attribute 8 of them to their ground truth origins while using Google only works for 3 cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.10944v2-abstract-full').style.display = 'none'; document.getElementById('2404.10944v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers). 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.02152">arXiv:2404.02152</a> <span> [<a href="https://arxiv.org/pdf/2404.02152">pdf</a>, <a href="https://arxiv.org/format/2404.02152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeneAvatar: Generic Expression-Aware Volumetric Head Avatar Editing from a Single Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinda Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bangbang Yang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Pollefeys%2C+M">Marc Pollefeys</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.02152v1-abstract-short" style="display: inline;"> Recently, we have witnessed the explosive growth of various volumetric representations in modeling animatable head avatars. However, due to the diversity of frameworks, there is no practical method to support high-level applications like 3D head avatar editing across different representations. In this paper, we propose a generic avatar editing approach that can be universally applied to various 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02152v1-abstract-full').style.display = 'inline'; document.getElementById('2404.02152v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.02152v1-abstract-full" style="display: none;"> Recently, we have witnessed the explosive growth of various volumetric representations in modeling animatable head avatars. However, due to the diversity of frameworks, there is no practical method to support high-level applications like 3D head avatar editing across different representations. In this paper, we propose a generic avatar editing approach that can be universally applied to various 3DMM driving volumetric head avatars. To achieve this goal, we design a novel expression-aware modification generative model, which enables lift 2D editing from a single image to a consistent 3D modification field. To ensure the effectiveness of the generative modification process, we develop several techniques, including an expression-dependent modification distillation scheme to draw knowledge from the large-scale head avatar model and 2D facial texture editing tools, implicit latent space guidance to enhance model convergence, and a segmentation-based loss reweight strategy for fine-grained texture inversion. Extensive experiments demonstrate that our method delivers high-quality and consistent results across multiple expression and viewpoints. Project page: https://zju3dv.github.io/geneavatar/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.02152v1-abstract-full').style.display = 'none'; document.getElementById('2404.02152v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2024. Project page: https://zju3dv.github.io/geneavatar/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17502">arXiv:2403.17502</a> <span> [<a href="https://arxiv.org/pdf/2403.17502">pdf</a>, <a href="https://arxiv.org/format/2403.17502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SeNM-VAE: Semi-Supervised Noise Modeling with Hierarchical Variational Autoencoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+D">Dihan Zheng</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yihang Zou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17502v1-abstract-short" style="display: inline;"> The data bottleneck has emerged as a fundamental challenge in learning based image restoration methods. Researchers have attempted to generate synthesized training data using paired or unpaired samples to address this challenge. This study proposes SeNM-VAE, a semi-supervised noise modeling method that leverages both paired and unpaired datasets to generate realistic degraded data. Our approach is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17502v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17502v1-abstract-full" style="display: none;"> The data bottleneck has emerged as a fundamental challenge in learning based image restoration methods. Researchers have attempted to generate synthesized training data using paired or unpaired samples to address this challenge. This study proposes SeNM-VAE, a semi-supervised noise modeling method that leverages both paired and unpaired datasets to generate realistic degraded data. Our approach is based on modeling the conditional distribution of degraded and clean images with a specially designed graphical model. Under the variational inference framework, we develop an objective function for handling both paired and unpaired data. We employ our method to generate paired training samples for real-world image denoising and super-resolution tasks. Our approach excels in the quality of synthetic degraded images compared to other unpaired and paired noise modeling methods. Furthermore, our approach demonstrates remarkable performance in downstream image restoration tasks, even with limited paired data. With more paired data, our method achieves the best performance on the SIDD dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17502v1-abstract-full').style.display = 'none'; document.getElementById('2403.17502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15726">arXiv:2403.15726</a> <span> [<a href="https://arxiv.org/pdf/2403.15726">pdf</a>, <a href="https://arxiv.org/format/2403.15726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Convection-Diffusion Equation: A Theoretically Certified Framework for Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tangjun Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15726v1-abstract-short" style="display: inline;"> In this paper, we study the partial differential equation models of neural networks. Neural network can be viewed as a map from a simple base model to a complicate function. Based on solid analysis, we show that this map can be formulated by a convection-diffusion equation. This theoretically certified framework gives mathematical foundation and more understanding of neural networks. Moreover, bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15726v1-abstract-full').style.display = 'inline'; document.getElementById('2403.15726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15726v1-abstract-full" style="display: none;"> In this paper, we study the partial differential equation models of neural networks. Neural network can be viewed as a map from a simple base model to a complicate function. Based on solid analysis, we show that this map can be formulated by a convection-diffusion equation. This theoretically certified framework gives mathematical foundation and more understanding of neural networks. Moreover, based on the convection-diffusion equation model, we design a novel network structure, which incorporates diffusion mechanism into network architecture. Extensive experiments on both benchmark datasets and real-world applications validate the performance of the proposed model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15726v1-abstract-full').style.display = 'none'; document.getElementById('2403.15726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.13909">arXiv:2403.13909</a> <span> [<a href="https://arxiv.org/pdf/2403.13909">pdf</a>, <a href="https://arxiv.org/format/2403.13909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Sequential Modeling of Complex Marine Navigation: Case Study on a Passenger Vessel (Student Abstract) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yimeng Fan</a>, <a href="/search/cs?searchtype=author&query=Agand%2C+P">Pedram Agand</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mo Chen</a>, <a href="/search/cs?searchtype=author&query=Park%2C+E+J">Edward J. Park</a>, <a href="/search/cs?searchtype=author&query=Kennedy%2C+A">Allison Kennedy</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+C">Chanwoo Bae</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.13909v1-abstract-short" style="display: inline;"> The maritime industry's continuous commitment to sustainability has led to a dedicated exploration of methods to reduce vessel fuel consumption. This paper undertakes this challenge through a machine learning approach, leveraging a real-world dataset spanning two years of a ferry in west coast Canada. Our focus centers on the creation of a time series forecasting model given the dynamic and static… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13909v1-abstract-full').style.display = 'inline'; document.getElementById('2403.13909v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.13909v1-abstract-full" style="display: none;"> The maritime industry's continuous commitment to sustainability has led to a dedicated exploration of methods to reduce vessel fuel consumption. This paper undertakes this challenge through a machine learning approach, leveraging a real-world dataset spanning two years of a ferry in west coast Canada. Our focus centers on the creation of a time series forecasting model given the dynamic and static states, actions, and disturbances. This model is designed to predict dynamic states based on the actions provided, subsequently serving as an evaluative tool to assess the proficiency of the ferry's operation under the captain's guidance. Additionally, it lays the foundation for future optimization algorithms, providing valuable feedback on decision-making processes. To facilitate future studies, our code is available at \url{https://github.com/pagand/model_optimze_vessel/tree/AAAI} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.13909v1-abstract-full').style.display = 'none'; document.getElementById('2403.13909v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, AAAI 2024 student abstract</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.07283">arXiv:2403.07283</a> <span> [<a href="https://arxiv.org/pdf/2403.07283">pdf</a>, <a href="https://arxiv.org/format/2403.07283">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Framework for Cost-Effective and Self-Adaptive LLM Shaking and Recovery Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhiyu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Suochao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingbo Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiwen Zhou</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenfu Bao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dianhai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.07283v1-abstract-short" style="display: inline;"> As Large Language Models (LLMs) gain great success in real-world applications, an increasing number of users are seeking to develop and deploy their customized LLMs through cloud services. Nonetheless, in some specific domains, there are still concerns regarding cost and trade-offs between privacy issues and accuracy. In this study, we introduce a cost-effective and self-adaptive LLM shaking tunin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07283v1-abstract-full').style.display = 'inline'; document.getElementById('2403.07283v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.07283v1-abstract-full" style="display: none;"> As Large Language Models (LLMs) gain great success in real-world applications, an increasing number of users are seeking to develop and deploy their customized LLMs through cloud services. Nonetheless, in some specific domains, there are still concerns regarding cost and trade-offs between privacy issues and accuracy. In this study, we introduce a cost-effective and self-adaptive LLM shaking tuning and recovery mechanism, named CypherTalk. With carefully designed horizontal and vertical shaking operators, we can achieve comparable accuracy results with SOTA privacy-preserving LLM schemes using Cryptography-based or Differential Privacy-based methods. Experiments also show that with the CypherTalk framework, users can achieve reliable accuracy when using optimized shaking operator settings. To our best knowledge, this is the first work that considers cost, and trade-off between model utility and privacy in LLM scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.07283v1-abstract-full').style.display = 'none'; document.getElementById('2403.07283v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01873">arXiv:2403.01873</a> <span> [<a href="https://arxiv.org/pdf/2403.01873">pdf</a>, <a href="https://arxiv.org/format/2403.01873">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Recommending Missed Citations Identified by Reviewers: A New Task, Dataset and Baselines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+K">Kehan Long</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shasha Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pancheng Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenlong Bao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jintao Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01873v1-abstract-short" style="display: inline;"> Citing comprehensively and appropriately has become a challenging task with the explosive growth of scientific publications. Current citation recommendation systems aim to recommend a list of scientific papers for a given text context or a draft paper. However, none of the existing work focuses on already included citations of full papers, which are imperfect and still have much room for improveme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01873v1-abstract-full').style.display = 'inline'; document.getElementById('2403.01873v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01873v1-abstract-full" style="display: none;"> Citing comprehensively and appropriately has become a challenging task with the explosive growth of scientific publications. Current citation recommendation systems aim to recommend a list of scientific papers for a given text context or a draft paper. However, none of the existing work focuses on already included citations of full papers, which are imperfect and still have much room for improvement. In the scenario of peer reviewing, it is a common phenomenon that submissions are identified as missing vital citations by reviewers. This may lead to a negative impact on the credibility and validity of the research presented. To help improve citations of full papers, we first define a novel task of Recommending Missed Citations Identified by Reviewers (RMC) and construct a corresponding expert-labeled dataset called CitationR. We conduct an extensive evaluation of several state-of-the-art methods on CitationR. Furthermore, we propose a new framework RMCNet with an Attentive Reference Encoder module mining the relevance between papers, already-made citations, and missed citations. Empirical results prove that RMC is challenging, with the proposed architecture outperforming previous methods in all metrics. We release our dataset and benchmark models to motivate future research on this challenging new task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01873v1-abstract-full').style.display = 'none'; document.getElementById('2403.01873v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">COLING 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09272">arXiv:2402.09272</a> <span> [<a href="https://arxiv.org/pdf/2402.09272">pdf</a>, <a href="https://arxiv.org/format/2402.09272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Insights and caveats from mining local and global temporal motifs in cryptocurrency transaction networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arnold%2C+N+A">Naomi A. Arnold</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+P">Peijie Zhong</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+C+T">Cheick Tidiane Ba</a>, <a href="/search/cs?searchtype=author&query=Steer%2C+B">Ben Steer</a>, <a href="/search/cs?searchtype=author&query=Mondragon%2C+R">Raul Mondragon</a>, <a href="/search/cs?searchtype=author&query=Cuadrado%2C+F">Felix Cuadrado</a>, <a href="/search/cs?searchtype=author&query=Lambiotte%2C+R">Renaud Lambiotte</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+R+G">Richard G. Clegg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09272v2-abstract-short" style="display: inline;"> Distributed ledger technologies have opened up a wealth of fine-grained transaction data from cryptocurrencies like Bitcoin and Ethereum. This allows research into problems like anomaly detection, anti-money laundering, pattern mining and activity clustering (where data from traditional currencies is rarely available). The formalism of temporal networks offers a natural way of representing this da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09272v2-abstract-full').style.display = 'inline'; document.getElementById('2402.09272v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09272v2-abstract-full" style="display: none;"> Distributed ledger technologies have opened up a wealth of fine-grained transaction data from cryptocurrencies like Bitcoin and Ethereum. This allows research into problems like anomaly detection, anti-money laundering, pattern mining and activity clustering (where data from traditional currencies is rarely available). The formalism of temporal networks offers a natural way of representing this data and offers access to a wealth of metrics and models. However, the large scale of the data presents a challenge using standard graph analysis techniques. We use temporal motifs to analyse two Bitcoin datasets and one NFT dataset, using sequences of three transactions and up to three users. We show that the commonly used technique of simply counting temporal motifs over all users and all time can give misleading conclusions. Here we also study the motifs contributed by each user and discover that the motif distribution is heavy-tailed and that the key players have diverse motif signatures. We study the motifs that occur in different time periods and find events and anomalous activity that cannot be seen just by a count on the whole dataset. Studying motif completion time reveals dynamics driven by human behaviour as well as algorithmic behaviour. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09272v2-abstract-full').style.display = 'none'; document.getElementById('2402.09272v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.05829">arXiv:2402.05829</a> <span> [<a href="https://arxiv.org/pdf/2402.05829">pdf</a>, <a href="https://arxiv.org/format/2402.05829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Limitations of Agents Simulated by Predictive Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Douglas%2C+R">Raymond Douglas</a>, <a href="/search/cs?searchtype=author&query=Karwowski%2C+J">Jacek Karwowski</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+C">Chan Bae</a>, <a href="/search/cs?searchtype=author&query=Draguns%2C+A">Andis Draguns</a>, <a href="/search/cs?searchtype=author&query=Krakovna%2C+V">Victoria Krakovna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.05829v1-abstract-short" style="display: inline;"> There is increasing focus on adapting predictive models into agent-like systems, most notably AI assistants based on language models. We outline two structural reasons for why these models can fail when turned into agents. First, we discuss auto-suggestive delusions. Prior work has shown theoretically that models fail to imitate agents that generated the training data if the agents relied on hidde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05829v1-abstract-full').style.display = 'inline'; document.getElementById('2402.05829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.05829v1-abstract-full" style="display: none;"> There is increasing focus on adapting predictive models into agent-like systems, most notably AI assistants based on language models. We outline two structural reasons for why these models can fail when turned into agents. First, we discuss auto-suggestive delusions. Prior work has shown theoretically that models fail to imitate agents that generated the training data if the agents relied on hidden observations: the hidden observations act as confounding variables, and the models treat actions they generate as evidence for nonexistent observations. Second, we introduce and formally study a related, novel limitation: predictor-policy incoherence. When a model generates a sequence of actions, the model's implicit prediction of the policy that generated those actions can serve as a confounding variable. The result is that models choose actions as if they expect future actions to be suboptimal, causing them to be overly conservative. We show that both of those failures are fixed by including a feedback loop from the environment, that is, re-training the models on their own actions. We give simple demonstrations of both limitations using Decision Transformers and confirm that empirical results agree with our conceptual and formal analysis. Our treatment provides a unifying view of those failure modes, and informs the question of why fine-tuning offline learned policies with online learning makes them more effective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.05829v1-abstract-full').style.display = 'none'; document.getElementById('2402.05829v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04038">arXiv:2312.04038</a> <span> [<a href="https://arxiv.org/pdf/2312.04038">pdf</a>, <a href="https://arxiv.org/format/2312.04038">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Reconstruction of dynamical systems from data without time labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhijun Zeng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+P">Pipi Hu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04038v2-abstract-short" style="display: inline;"> In this paper, we study the method to reconstruct dynamical systems from data without time labels. Data without time labels appear in many applications, such as molecular dynamics, single-cell RNA sequencing etc. Reconstruction of dynamical system from time sequence data has been studied extensively. However, these methods do not apply if time labels are unknown. Without time labels, sequence data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04038v2-abstract-full').style.display = 'inline'; document.getElementById('2312.04038v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04038v2-abstract-full" style="display: none;"> In this paper, we study the method to reconstruct dynamical systems from data without time labels. Data without time labels appear in many applications, such as molecular dynamics, single-cell RNA sequencing etc. Reconstruction of dynamical system from time sequence data has been studied extensively. However, these methods do not apply if time labels are unknown. Without time labels, sequence data becomes distribution data. Based on this observation, we propose to treat the data as samples from a probability distribution and try to reconstruct the underlying dynamical system by minimizing the distribution loss, sliced Wasserstein distance more specifically. Extensive experiment results demonstrate the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04038v2-abstract-full').style.display = 'none'; document.getElementById('2312.04038v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08747">arXiv:2311.08747</a> <span> [<a href="https://arxiv.org/pdf/2311.08747">pdf</a>, <a href="https://arxiv.org/format/2311.08747">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improved Dense Nested Attention Network Based on Transformer for Infrared Small Target Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chun Bao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jie Cao</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+Y">Yaqian Ning</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tianhua Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhijun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zechen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Q">Qun Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08747v3-abstract-short" style="display: inline;"> Infrared small target detection based on deep learning offers unique advantages in separating small targets from complex and dynamic backgrounds. However, the features of infrared small targets gradually weaken as the depth of convolutional neural network (CNN) increases. To address this issue, we propose a novel method for detecting infrared small targets called improved dense nested attention ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08747v3-abstract-full').style.display = 'inline'; document.getElementById('2311.08747v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08747v3-abstract-full" style="display: none;"> Infrared small target detection based on deep learning offers unique advantages in separating small targets from complex and dynamic backgrounds. However, the features of infrared small targets gradually weaken as the depth of convolutional neural network (CNN) increases. To address this issue, we propose a novel method for detecting infrared small targets called improved dense nested attention network (IDNANet), which is based on the transformer architecture. We preserve the dense nested structure of dense nested attention network (DNANet) and introduce the Swin-transformer during feature extraction stage to enhance the continuity of features. Furthermore, we integrate the ACmix attention structure into the dense nested structure to enhance the features of intermediate layers. Additionally, we design a weighted dice binary cross-entropy (WD-BCE) loss function to mitigate the negative impact of foreground-background imbalance in the samples. Moreover, we develop a dataset specifically for infrared small targets, called BIT-SIRST. The dataset comprises a significant amount of real-world targets and manually annotated labels, as well as synthetic data and corresponding labels. We have evaluated the effectiveness of our method through experiments conducted on public datasets. In comparison to other state-of-the-art methods, our approach outperforms in terms of probability of detection ($P_d$), false-alarm rate ($F_a$), and mean intersection of union ($mIoU$). The $mIoU$ reaches 90.89\% on the NUDT-SIRST dataset and 79.72\% on the SIRST dataset. The BIT-SIRST dataset and codes are available openly at \href{https://github.com/EdwardBao1006/bit\_sirst}{\color[HTML]{B22222}{https://github.com/EdwardBao1006/bit\_sirst}}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08747v3-abstract-full').style.display = 'none'; document.getElementById('2311.08747v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13123">arXiv:2310.13123</a> <span> [<a href="https://arxiv.org/pdf/2310.13123">pdf</a>, <a href="https://arxiv.org/format/2310.13123">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.oceaneng.2023.115271">10.1016/j.oceaneng.2023.115271 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Fuel Consumption Prediction for a Passenger Ferry using Machine Learning and In-service Data: A Comparative Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Agand%2C+P">Pedram Agand</a>, <a href="/search/cs?searchtype=author&query=Kennedy%2C+A">Allison Kennedy</a>, <a href="/search/cs?searchtype=author&query=Harris%2C+T">Trevor Harris</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+C">Chanwoo Bae</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mo Chen</a>, <a href="/search/cs?searchtype=author&query=Park%2C+E+J">Edward J Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13123v2-abstract-short" style="display: inline;"> As the importance of eco-friendly transportation increases, providing an efficient approach for marine vessel operation is essential. Methods for status monitoring with consideration to the weather condition and forecasting with the use of in-service data from ships requires accurate and complete models for predicting the energy efficiency of a ship. The models need to effectively process all the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13123v2-abstract-full').style.display = 'inline'; document.getElementById('2310.13123v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13123v2-abstract-full" style="display: none;"> As the importance of eco-friendly transportation increases, providing an efficient approach for marine vessel operation is essential. Methods for status monitoring with consideration to the weather condition and forecasting with the use of in-service data from ships requires accurate and complete models for predicting the energy efficiency of a ship. The models need to effectively process all the operational data in real-time. This paper presents models that can predict fuel consumption using in-service data collected from a passenger ship. Statistical and domain-knowledge methods were used to select the proper input variables for the models. These methods prevent over-fitting, missing data, and multicollinearity while providing practical applicability. Prediction models that were investigated include multiple linear regression (MLR), decision tree approach (DT), an artificial neural network (ANN), and ensemble methods. The best predictive performance was from a model developed using the XGboost technique which is a boosting ensemble approach. \rvv{Our code is available on GitHub at \url{https://github.com/pagand/model_optimze_vessel/tree/OE} for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13123v2-abstract-full').style.display = 'none'; document.getElementById('2310.13123v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 11 figures, 7 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Ocean Engineering 284 (2023): 115271 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.01361">arXiv:2310.01361</a> <span> [<a href="https://arxiv.org/pdf/2310.01361">pdf</a>, <a href="https://arxiv.org/format/2310.01361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GenSim: Generating Robotic Simulation Tasks via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lirui Wang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Y">Yiyang Ling</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zhecheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Shridhar%2C+M">Mohit Shridhar</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yuzhe Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bailin Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Huazhe Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.01361v2-abstract-short" style="display: inline;"> Collecting large amounts of real-world interaction data to train general robotic policies is often prohibitively expensive, thus motivating the use of simulation data. However, existing methods for data generation have generally focused on scene-level diversity (e.g., object instances and poses) rather than task-level diversity, due to the human effort required to come up with and verify novel tas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01361v2-abstract-full').style.display = 'inline'; document.getElementById('2310.01361v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.01361v2-abstract-full" style="display: none;"> Collecting large amounts of real-world interaction data to train general robotic policies is often prohibitively expensive, thus motivating the use of simulation data. However, existing methods for data generation have generally focused on scene-level diversity (e.g., object instances and poses) rather than task-level diversity, due to the human effort required to come up with and verify novel tasks. This has made it challenging for policies trained on simulation data to demonstrate significant task-level generalization. In this paper, we propose to automatically generate rich simulation environments and expert demonstrations by exploiting a large language models' (LLM) grounding and coding ability. Our approach, dubbed GenSim, has two modes: goal-directed generation, wherein a target task is given to the LLM and the LLM proposes a task curriculum to solve the target task, and exploratory generation, wherein the LLM bootstraps from previous tasks and iteratively proposes novel tasks that would be helpful in solving more complex tasks. We use GPT4 to expand the existing benchmark by ten times to over 100 tasks, on which we conduct supervised finetuning and evaluate several LLMs including finetuned GPTs and Code Llama on code generation for robotic simulation tasks. Furthermore, we observe that LLMs-generated simulation programs can enhance task-level generalization significantly when used for multitask policy training. We further find that with minimal sim-to-real adaptation, the multitask policies pretrained on GPT4-generated simulation tasks exhibit stronger transfer to unseen long-horizon tasks in the real world and outperform baselines by 25%. See the project website (https://liruiw.github.io/gensim) for code, demos, and videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.01361v2-abstract-full').style.display = 'none'; document.getElementById('2310.01361v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">See our project website (https://liruiw.github.io/gensim), demo and datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code (https://github.com/liruiw/GenSim) for more details</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Conference on Learning Representations (ICLR), 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.14954">arXiv:2309.14954</a> <span> [<a href="https://arxiv.org/pdf/2309.14954">pdf</a>, <a href="https://arxiv.org/format/2309.14954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Addressing preferred orientation in single-particle cryo-EM through AI-generated auxiliary particles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+D">Dihan Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiurong Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+N">Nieng Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Mingxu Hu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.14954v1-abstract-short" style="display: inline;"> The single-particle cryo-EM field faces the persistent challenge of preferred orientation, lacking general computational solutions. We introduce cryoPROS, an AI-based approach designed to address the above issue. By generating the auxiliary particles with a conditional deep generative model, cryoPROS addresses the intrinsic bias in orientation estimation for the observed particles. We effectively… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14954v1-abstract-full').style.display = 'inline'; document.getElementById('2309.14954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.14954v1-abstract-full" style="display: none;"> The single-particle cryo-EM field faces the persistent challenge of preferred orientation, lacking general computational solutions. We introduce cryoPROS, an AI-based approach designed to address the above issue. By generating the auxiliary particles with a conditional deep generative model, cryoPROS addresses the intrinsic bias in orientation estimation for the observed particles. We effectively employed cryoPROS in the cryo-EM single particle analysis of the hemagglutinin trimer, showing the ability to restore the near-atomic resolution structure on non-tilt data. Moreover, the enhanced version named cryoPROS-MP significantly improves the resolution of the membrane protein NaX using the no-tilted data that contains the effects of micelles. Compared to the classical approaches, cryoPROS does not need special experimental or image acquisition techniques, providing a purely computational yet effective solution for the preferred orientation problem. Finally, we conduct extensive experiments that establish the low risk of model bias and the high robustness of cryoPROS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.14954v1-abstract-full').style.display = 'none'; document.getElementById('2309.14954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.13504">arXiv:2309.13504</a> <span> [<a href="https://arxiv.org/pdf/2309.13504">pdf</a>, <a href="https://arxiv.org/format/2309.13504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Attention Is All You Need For Blind Room Volume Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunxi Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M">Maoshen Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meiran Li</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Changchun Bao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+W">Wenyu Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.13504v3-abstract-short" style="display: inline;"> In recent years, dynamic parameterization of acoustic environments has raised increasing attention in the field of audio processing. One of the key parameters that characterize the local room acoustics in isolation from orientation and directivity of sources and receivers is the geometric room volume. Convolutional neural networks (CNNs) have been widely selected as the main models for conducting… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13504v3-abstract-full').style.display = 'inline'; document.getElementById('2309.13504v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.13504v3-abstract-full" style="display: none;"> In recent years, dynamic parameterization of acoustic environments has raised increasing attention in the field of audio processing. One of the key parameters that characterize the local room acoustics in isolation from orientation and directivity of sources and receivers is the geometric room volume. Convolutional neural networks (CNNs) have been widely selected as the main models for conducting blind room acoustic parameter estimation, which aims to learn a direct mapping from audio spectrograms to corresponding labels. With the recent trend of self-attention mechanisms, this paper introduces a purely attention-based model to blindly estimate room volumes based on single-channel noisy speech signals. We demonstrate the feasibility of eliminating the reliance on CNN for this task and the proposed Transformer architecture takes Gammatone magnitude spectral coefficients and phase spectrograms as inputs. To enhance the model performance given the task-specific dataset, cross-modality transfer learning is also applied. Experimental results demonstrate that the proposed model outperforms traditional CNN models across a wide range of real-world acoustics spaces, especially with the help of the dedicated pretraining and data augmentation schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.13504v3-abstract-full').style.display = 'none'; document.getElementById('2309.13504v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures, to be published in proceedings of ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.01131">arXiv:2309.01131</a> <span> [<a href="https://arxiv.org/pdf/2309.01131">pdf</a>, <a href="https://arxiv.org/format/2309.01131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Attention Where It Matters: Rethinking Visual Document Understanding with Selective Region Concentration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+H">Haoyu Cao</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Changcun Bao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chaohu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huang Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+K">Kun Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yinsong Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Deqiang Jiang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xing Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.01131v1-abstract-short" style="display: inline;"> We propose a novel end-to-end document understanding model called SeRum (SElective Region Understanding Model) for extracting meaningful information from document images, including document analysis, retrieval, and office automation. Unlike state-of-the-art approaches that rely on multi-stage technical schemes and are computationally expensive, SeRum converts document image understanding and r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01131v1-abstract-full').style.display = 'inline'; document.getElementById('2309.01131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.01131v1-abstract-full" style="display: none;"> We propose a novel end-to-end document understanding model called SeRum (SElective Region Understanding Model) for extracting meaningful information from document images, including document analysis, retrieval, and office automation. Unlike state-of-the-art approaches that rely on multi-stage technical schemes and are computationally expensive, SeRum converts document image understanding and recognition tasks into a local decoding process of the visual tokens of interest, using a content-aware token merge module. This mechanism enables the model to pay more attention to regions of interest generated by the query decoder, improving the model's effectiveness and speeding up the decoding speed of the generative scheme. We also designed several pre-training tasks to enhance the understanding and local awareness of the model. Experimental results demonstrate that SeRum achieves state-of-the-art performance on document understanding tasks and competitive results on text spotting tasks. SeRum represents a substantial advancement towards enabling efficient and effective end-to-end document understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.01131v1-abstract-full').style.display = 'none'; document.getElementById('2309.01131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.03280">arXiv:2308.03280</a> <span> [<a href="https://arxiv.org/pdf/2308.03280">pdf</a>, <a href="https://arxiv.org/format/2308.03280">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3581783.3611857">10.1145/3581783.3611857 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with Whitted-Style Ray Tracing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Junyi Zeng</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zilong Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.03280v1-abstract-short" style="display: inline;"> Recently, Neural Radiance Fields (NeRF) has exhibited significant success in novel view synthesis, surface reconstruction, etc. However, since no physical reflection is considered in its rendering pipeline, NeRF mistakes the reflection in the mirror as a separate virtual scene, leading to the inaccurate reconstruction of the mirror and multi-view inconsistent reflections in the mirror. In this pap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03280v1-abstract-full').style.display = 'inline'; document.getElementById('2308.03280v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.03280v1-abstract-full" style="display: none;"> Recently, Neural Radiance Fields (NeRF) has exhibited significant success in novel view synthesis, surface reconstruction, etc. However, since no physical reflection is considered in its rendering pipeline, NeRF mistakes the reflection in the mirror as a separate virtual scene, leading to the inaccurate reconstruction of the mirror and multi-view inconsistent reflections in the mirror. In this paper, we present a novel neural rendering framework, named Mirror-NeRF, which is able to learn accurate geometry and reflection of the mirror and support various scene manipulation applications with mirrors, such as adding new objects or mirrors into the scene and synthesizing the reflections of these new objects in mirrors, controlling mirror roughness, etc. To achieve this goal, we propose a unified radiance field by introducing the reflection probability and tracing rays following the light transport model of Whitted Ray Tracing, and also develop several techniques to facilitate the learning process. Experiments and comparisons on both synthetic and real datasets demonstrate the superiority of our method. The code and supplementary material are available on the project webpage: https://zju3dv.github.io/Mirror-NeRF/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.03280v1-abstract-full').style.display = 'none'; document.getElementById('2308.03280v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACM Multimedia 2023. Project Page: https://zju3dv.github.io/Mirror-NeRF/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.12333">arXiv:2307.12333</a> <span>  </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An axiomatized PDE model of deep neural networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tangjun Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+W">Wenqi Tao</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.12333v2-abstract-short" style="display: inline;"> Inspired by the relation between deep neural network (DNN) and partial differential equations (PDEs), we study the general form of the PDE models of deep neural networks. To achieve this goal, we formulate DNN as an evolution operator from a simple base model. Based on several reasonable assumptions, we prove that the evolution operator is actually determined by convection-diffusion equation. This… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12333v2-abstract-full').style.display = 'inline'; document.getElementById('2307.12333v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.12333v2-abstract-full" style="display: none;"> Inspired by the relation between deep neural network (DNN) and partial differential equations (PDEs), we study the general form of the PDE models of deep neural networks. To achieve this goal, we formulate DNN as an evolution operator from a simple base model. Based on several reasonable assumptions, we prove that the evolution operator is actually determined by convection-diffusion equation. This convection-diffusion equation model gives mathematical explanation for several effective networks. Moreover, we show that the convection-diffusion model improves the robustness and reduces the Rademacher complexity. Based on the convection-diffusion equation, we design a new training method for ResNets. Experiments validate the performance of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.12333v2-abstract-full').style.display = 'none'; document.getElementById('2307.12333v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The experiment design in the paper lacks careful thought and may be misleading in demonstrating our contribution</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.01676">arXiv:2307.01676</a> <span> [<a href="https://arxiv.org/pdf/2307.01676">pdf</a>, <a href="https://arxiv.org/format/2307.01676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RaidEnv: Exploring New Challenges in Automated Content Balancing for Boss Raid Games </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeon%2C+H">Hyeon-Chang Jeon</a>, <a href="/search/cs?searchtype=author&query=Baek%2C+I">In-Chang Baek</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+C">Cheong-mok Bae</a>, <a href="/search/cs?searchtype=author&query=Park%2C+T">Taehwa Park</a>, <a href="/search/cs?searchtype=author&query=You%2C+W">Wonsang You</a>, <a href="/search/cs?searchtype=author&query=Ha%2C+T">Taegwan Ha</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+H">Hoyun Jung</a>, <a href="/search/cs?searchtype=author&query=Noh%2C+J">Jinha Noh</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S">Seungwon Oh</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Joong Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.01676v1-abstract-short" style="display: inline;"> The balance of game content significantly impacts the gaming experience. Unbalanced game content diminishes engagement or increases frustration because of repetitive failure. Although game designers intend to adjust the difficulty of game content, this is a repetitive, labor-intensive, and challenging process, especially for commercial-level games with extensive content. To address this issue, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.01676v1-abstract-full').style.display = 'inline'; document.getElementById('2307.01676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.01676v1-abstract-full" style="display: none;"> The balance of game content significantly impacts the gaming experience. Unbalanced game content diminishes engagement or increases frustration because of repetitive failure. Although game designers intend to adjust the difficulty of game content, this is a repetitive, labor-intensive, and challenging process, especially for commercial-level games with extensive content. To address this issue, the game research community has explored automated game balancing using artificial intelligence (AI) techniques. However, previous studies have focused on limited game content and did not consider the importance of the generalization ability of playtesting agents when encountering content changes. In this study, we propose RaidEnv, a new game simulator that includes diverse and customizable content for the boss raid scenario in MMORPG games. Additionally, we design two benchmarks for the boss raid scenario that can aid in the practical application of game AI. These benchmarks address two open problems in automatic content balancing, and we introduce two evaluation metrics to provide guidance for AI in automatic content balancing. This novel game research platform expands the frontiers of automatic game balancing problems and offers a framework within a realistic game production pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.01676v1-abstract-full').style.display = 'none'; document.getElementById('2307.01676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 6 figures, 6 tables, 2 algorithms</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.16309">arXiv:2306.16309</a> <span> [<a href="https://arxiv.org/pdf/2306.16309">pdf</a>, <a href="https://arxiv.org/format/2306.16309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Raphtory: The temporal graph engine for Rust and Python </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Steer%2C+B">Ben Steer</a>, <a href="/search/cs?searchtype=author&query=Arnold%2C+N">Naomi Arnold</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+C+T">Cheick Tidiane Ba</a>, <a href="/search/cs?searchtype=author&query=Lambiotte%2C+R">Renaud Lambiotte</a>, <a href="/search/cs?searchtype=author&query=Yousaf%2C+H">Haaroon Yousaf</a>, <a href="/search/cs?searchtype=author&query=Jeub%2C+L">Lucas Jeub</a>, <a href="/search/cs?searchtype=author&query=Murariu%2C+F">Fabian Murariu</a>, <a href="/search/cs?searchtype=author&query=Kapoor%2C+S">Shivam Kapoor</a>, <a href="/search/cs?searchtype=author&query=Rico%2C+P">Pedro Rico</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+R">Rachel Chan</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+L">Louis Chan</a>, <a href="/search/cs?searchtype=author&query=Alford%2C+J">James Alford</a>, <a href="/search/cs?searchtype=author&query=Clegg%2C+R+G">Richard G. Clegg</a>, <a href="/search/cs?searchtype=author&query=Cuadrado%2C+F">Felix Cuadrado</a>, <a href="/search/cs?searchtype=author&query=Barnes%2C+M+R">Matthew Russell Barnes</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+P">Peijie Zhong</a>, <a href="/search/cs?searchtype=author&query=Biyong%2C+J+N+P">John N. Pougu茅 Biyong</a>, <a href="/search/cs?searchtype=author&query=Alnaimi%2C+A">Alhamza Alnaimi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.16309v2-abstract-short" style="display: inline;"> Raphtory is a platform for building and analysing temporal networks. The library includes methods for creating networks from a variety of data sources; algorithms to explore their structure and evolution; and an extensible GraphQL server for deployment of applications built on top. Raphtory's core engine is built in Rust, for efficiency, with Python interfaces, for ease of use. Raphtory is develop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16309v2-abstract-full').style.display = 'inline'; document.getElementById('2306.16309v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.16309v2-abstract-full" style="display: none;"> Raphtory is a platform for building and analysing temporal networks. The library includes methods for creating networks from a variety of data sources; algorithms to explore their structure and evolution; and an extensible GraphQL server for deployment of applications built on top. Raphtory's core engine is built in Rust, for efficiency, with Python interfaces, for ease of use. Raphtory is developed by network scientists, with a background in Physics, Applied Mathematics, Engineering and Computer Science, for use across academia and industry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.16309v2-abstract-full').style.display = 'none'; document.getElementById('2306.16309v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12113">arXiv:2306.12113</a> <span> [<a href="https://arxiv.org/pdf/2306.12113">pdf</a>, <a href="https://arxiv.org/format/2306.12113">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Lightweight wood panel defect detection method incorporating attention mechanism and feature fusion network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yongxin Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fanghua Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lai Jiang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Cheng Bao</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">You Miao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.12113v1-abstract-short" style="display: inline;"> In recent years, deep learning has made significant progress in wood panel defect detection. However, there are still challenges such as low detection , slow detection speed, and difficulties in deploying embedded devices on wood panel surfaces. To overcome these issues, we propose a lightweight wood panel defect detection method called YOLOv5-LW, which incorporates attention mechanisms and a feat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12113v1-abstract-full').style.display = 'inline'; document.getElementById('2306.12113v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.12113v1-abstract-full" style="display: none;"> In recent years, deep learning has made significant progress in wood panel defect detection. However, there are still challenges such as low detection , slow detection speed, and difficulties in deploying embedded devices on wood panel surfaces. To overcome these issues, we propose a lightweight wood panel defect detection method called YOLOv5-LW, which incorporates attention mechanisms and a feature fusion network.Firstly, to enhance the detection capability of acceptable defects, we introduce the Multi-scale Bi-directional Feature Pyramid Network (MBiFPN) as a feature fusion network. The MBiFPN reduces feature loss, enriches local and detailed features, and improves the model's detection capability for acceptable defects.Secondly, to achieve a lightweight design, we reconstruct the ShuffleNetv2 network model as the backbone network. This reconstruction reduces the number of parameters and computational requirements while maintaining performance. We also introduce the Stem Block and Spatial Pyramid Pooling Fast (SPPF) models to compensate for any accuracy loss resulting from the lightweight design, ensuring the model's detection capabilities remain intact while being computationally efficient.Thirdly, we enhance the backbone network by incorporating Efficient Channel Attention (ECA), which improves the network's focus on key information relevant to defect detection. By attending to essential features, the model becomes more proficient in accurately identifying and localizing defects.We validate the proposed method using a self-developed wood panel defect dataset.The experimental results demonstrate the effectiveness of the improved YOLOv5-LW method. Compared to the original model, our approach achieves a 92.8\% accuracy rate, reduces the number of parameters by 27.78\%, compresses computational volume by 41.25\%, improves detection inference speed by 10.16\% <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12113v1-abstract-full').style.display = 'none'; document.getElementById('2306.12113v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.05706">arXiv:2305.05706</a> <span> [<a href="https://arxiv.org/pdf/2305.05706">pdf</a>, <a href="https://arxiv.org/format/2305.05706">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DexArt: Benchmarking Generalizable Dexterous Manipulation with Articulated Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Helin Xu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yuzhe Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.05706v1-abstract-short" style="display: inline;"> To enable general-purpose robots, we will require the robot to operate daily articulated objects as humans do. Current robot manipulation has heavily relied on using a parallel gripper, which restricts the robot to a limited set of objects. On the other hand, operating with a multi-finger robot hand will allow better approximation to human behavior and enable the robot to operate on diverse articu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05706v1-abstract-full').style.display = 'inline'; document.getElementById('2305.05706v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.05706v1-abstract-full" style="display: none;"> To enable general-purpose robots, we will require the robot to operate daily articulated objects as humans do. Current robot manipulation has heavily relied on using a parallel gripper, which restricts the robot to a limited set of objects. On the other hand, operating with a multi-finger robot hand will allow better approximation to human behavior and enable the robot to operate on diverse articulated objects. To this end, we propose a new benchmark called DexArt, which involves Dexterous manipulation with Articulated objects in a physical simulator. In our benchmark, we define multiple complex manipulation tasks, and the robot hand will need to manipulate diverse articulated objects within each task. Our main focus is to evaluate the generalizability of the learned policy on unseen articulated objects. This is very challenging given the high degrees of freedom of both hands and objects. We use Reinforcement Learning with 3D representation learning to achieve generalization. Through extensive studies, we provide new insights into how 3D representation learning affects decision making in RL with 3D point cloud inputs. More details can be found at https://www.chenbao.tech/dexart/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05706v1-abstract-full').style.display = 'none'; document.getElementById('2305.05706v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2023. Project page: https://www.chenbao.tech/dexart/ Equal contributors: Chen Bao, Helin Xu</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.01603">arXiv:2304.01603</a> <span> [<a href="https://arxiv.org/pdf/2304.01603">pdf</a>, <a href="https://arxiv.org/format/2304.01603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Locate Then Generate: Bridging Vision and Language with Bounding Box for Scene-Text VQA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yongxin Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yukang Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Changcun Bao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Linli Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.01603v1-abstract-short" style="display: inline;"> In this paper, we propose a novel multi-modal framework for Scene Text Visual Question Answering (STVQA), which requires models to read scene text in images for question answering. Apart from text or visual objects, which could exist independently, scene text naturally links text and visual modalities together by conveying linguistic semantics while being a visual object in an image simultaneously… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01603v1-abstract-full').style.display = 'inline'; document.getElementById('2304.01603v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.01603v1-abstract-full" style="display: none;"> In this paper, we propose a novel multi-modal framework for Scene Text Visual Question Answering (STVQA), which requires models to read scene text in images for question answering. Apart from text or visual objects, which could exist independently, scene text naturally links text and visual modalities together by conveying linguistic semantics while being a visual object in an image simultaneously. Different to conventional STVQA models which take the linguistic semantics and visual semantics in scene text as two separate features, in this paper, we propose a paradigm of "Locate Then Generate" (LTG), which explicitly unifies this two semantics with the spatial bounding box as a bridge connecting them. Specifically, at first, LTG locates the region in an image that may contain the answer words with an answer location module (ALM) consisting of a region proposal network and a language refinement network, both of which can transform to each other with one-to-one mapping via the scene text bounding box. Next, given the answer words selected by ALM, LTG generates a readable answer sequence with an answer generation module (AGM) based on a pre-trained language model. As a benefit of the explicit alignment of the visual and linguistic semantics, even without any scene text based pre-training tasks, LTG can boost the absolute accuracy by +6.06% and +6.92% on the TextVQA dataset and the ST-VQA dataset respectively, compared with a non-pre-training baseline. We further demonstrate that LTG effectively unifies visual and text modalities through the spatial bounding box connection, which is underappreciated in previous methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.01603v1-abstract-full').style.display = 'none'; document.getElementById('2304.01603v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted in AAAI 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.15710">arXiv:2303.15710</a> <span> [<a href="https://arxiv.org/pdf/2303.15710">pdf</a>, <a href="https://arxiv.org/format/2303.15710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Explicit Attention-Enhanced Fusion for RGB-Thermal Perception Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+M">Mingjian Liang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junjie Hu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenyu Bao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Hua Feng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+F">Fuqin Deng</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+T+L">Tin Lun Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.15710v1-abstract-short" style="display: inline;"> Recently, RGB-Thermal based perception has shown significant advances. Thermal information provides useful clues when visual cameras suffer from poor lighting conditions, such as low light and fog. However, how to effectively fuse RGB images and thermal data remains an open challenge. Previous works involve naive fusion strategies such as merging them at the input, concatenating multi-modality fea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15710v1-abstract-full').style.display = 'inline'; document.getElementById('2303.15710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.15710v1-abstract-full" style="display: none;"> Recently, RGB-Thermal based perception has shown significant advances. Thermal information provides useful clues when visual cameras suffer from poor lighting conditions, such as low light and fog. However, how to effectively fuse RGB images and thermal data remains an open challenge. Previous works involve naive fusion strategies such as merging them at the input, concatenating multi-modality features inside models, or applying attention to each data modality. These fusion strategies are straightforward yet insufficient. In this paper, we propose a novel fusion method named Explicit Attention-Enhanced Fusion (EAEF) that fully takes advantage of each type of data. Specifically, we consider the following cases: i) both RGB data and thermal data, ii) only one of the types of data, and iii) none of them generate discriminative features. EAEF uses one branch to enhance feature extraction for i) and iii) and the other branch to remedy insufficient representations for ii). The outputs of two branches are fused to form complementary features. As a result, the proposed fusion method outperforms state-of-the-art by 1.6\% in mIoU on semantic segmentation, 3.1\% in MAE on salient object detection, 2.3\% in mAP on object detection, and 8.1\% in MAE on crowd counting. The code is available at https://github.com/FreeformRobotics/EAEFNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15710v1-abstract-full').style.display = 'none'; document.getElementById('2303.15710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.13277">arXiv:2303.13277</a> <span> [<a href="https://arxiv.org/pdf/2303.13277">pdf</a>, <a href="https://arxiv.org/format/2303.13277">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinda Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bangbang Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+T">Tianxing Fan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zesong Yang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.13277v2-abstract-short" style="display: inline;"> Despite the great success in 2D editing using user-friendly tools, such as Photoshop, semantic strokes, or even text prompts, similar capabilities in 3D areas are still limited, either relying on 3D modeling skills or allowing editing within only a few categories. In this paper, we present a novel semantic-driven NeRF editing approach, which enables users to edit a neural radiance field with a sin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13277v2-abstract-full').style.display = 'inline'; document.getElementById('2303.13277v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.13277v2-abstract-full" style="display: none;"> Despite the great success in 2D editing using user-friendly tools, such as Photoshop, semantic strokes, or even text prompts, similar capabilities in 3D areas are still limited, either relying on 3D modeling skills or allowing editing within only a few categories. In this paper, we present a novel semantic-driven NeRF editing approach, which enables users to edit a neural radiance field with a single image, and faithfully delivers edited novel views with high fidelity and multi-view consistency. To achieve this goal, we propose a prior-guided editing field to encode fine-grained geometric and texture editing in 3D space, and develop a series of techniques to aid the editing process, including cyclic constraints with a proxy mesh to facilitate geometric supervision, a color compositing mechanism to stabilize semantic-driven texture editing, and a feature-cluster-based regularization to preserve the irrelevant content unchanged. Extensive experiments and editing examples on both real-world and synthetic data demonstrate that our method achieves photo-realistic 3D editing using only a single edited image, pushing the bound of semantic-driven editing in 3D real-world scenes. Our project webpage: https://zju3dv.github.io/sine/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.13277v2-abstract-full').style.display = 'none'; document.getElementById('2303.13277v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2023. Project Page: https://zju3dv.github.io/sine/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.10544">arXiv:2302.10544</a> <span> [<a href="https://arxiv.org/pdf/2302.10544">pdf</a>, <a href="https://arxiv.org/format/2302.10544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2023.3285479">10.1109/TCSVT.2023.3285479 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> EC-SfM: Efficient Covisibility-based Structure-from-Motion for Both Sequential and Unordered Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhichao Ye</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haomin Liu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.10544v2-abstract-short" style="display: inline;"> Structure-from-Motion is a technology used to obtain scene structure through image collection, which is a fundamental problem in computer vision. For unordered Internet images, SfM is very slow due to the lack of prior knowledge about image overlap. For sequential images, knowing the large overlap between adjacent frames, SfM can adopt a variety of acceleration strategies, which are only applicabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10544v2-abstract-full').style.display = 'inline'; document.getElementById('2302.10544v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.10544v2-abstract-full" style="display: none;"> Structure-from-Motion is a technology used to obtain scene structure through image collection, which is a fundamental problem in computer vision. For unordered Internet images, SfM is very slow due to the lack of prior knowledge about image overlap. For sequential images, knowing the large overlap between adjacent frames, SfM can adopt a variety of acceleration strategies, which are only applicable to sequential data. To further improve the reconstruction efficiency and break the gap of strategies between these two kinds of data, this paper presents an efficient covisibility-based incremental SfM. Different from previous methods, we exploit covisibility and registration dependency to describe the image connection which is suitable to any kind of data. Based on this general image connection, we propose a unified framework to efficiently reconstruct sequential images, unordered images, and the mixture of these two. Experiments on the unordered images and mixed data verify the effectiveness of the proposed method, which is three times faster than the state of the art on feature matching, and an order of magnitude faster on reconstruction without sacrificing the accuracy. The source code is publicly available at https://github.com/openxrlab/xrsfm <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.10544v2-abstract-full').style.display = 'none'; document.getElementById('2302.10544v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted 27 May 2023 (TCSVT)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.12698">arXiv:2211.12698</a> <span> [<a href="https://arxiv.org/pdf/2211.12698">pdf</a>, <a href="https://arxiv.org/format/2211.12698">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LGRS.2023.3270186">10.1109/LGRS.2023.3270186 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Rega-Net:Retina Gabor Attention for Deep Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chun Bao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jie Cao</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+Y">Yaqian Ning</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yang Cheng</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Q">Qun Hao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.12698v2-abstract-short" style="display: inline;"> Extensive research works demonstrate that the attention mechanism in convolutional neural networks (CNNs) effectively improves accuracy. Nevertheless, few works design attention mechanisms using large receptive fields. In this work, we propose a novel attention method named Rega-net to increase CNN accuracy by enlarging the receptive field. Inspired by the mechanism of the human retina, we design… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12698v2-abstract-full').style.display = 'inline'; document.getElementById('2211.12698v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.12698v2-abstract-full" style="display: none;"> Extensive research works demonstrate that the attention mechanism in convolutional neural networks (CNNs) effectively improves accuracy. Nevertheless, few works design attention mechanisms using large receptive fields. In this work, we propose a novel attention method named Rega-net to increase CNN accuracy by enlarging the receptive field. Inspired by the mechanism of the human retina, we design convolutional kernels to resemble the non-uniformly distributed structure of the human retina. Then, we sample variable-resolution values in the Gabor function distribution and fill these values in retina-like kernels. This distribution allows essential features to be more visible in the center position of the receptive field. We further design an attention module including these retina-like kernels. Experiments demonstrate that our Rega-Net achieves 79.96% top-1 accuracy on ImageNet-1K classification and 43.1% mAP on COCO2017 object detection. The mAP of the Rega-Net increased by up to 3.5% compared to baseline networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.12698v2-abstract-full').style.display = 'none'; document.getElementById('2211.12698v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.11248">arXiv:2211.11248</a> <span> [<a href="https://arxiv.org/pdf/2211.11248">pdf</a>, <a href="https://arxiv.org/format/2211.11248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Video Background Music Generation: Dataset, Method and Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuo%2C+L">Le Zhuo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaokai Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baisen Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yue Liao</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenxi Bao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Stanley Peng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Songhao Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aixi Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+F">Fei Fang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Si Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.11248v2-abstract-short" style="display: inline;"> Music is essential when editing videos, but selecting music manually is difficult and time-consuming. Thus, we seek to automatically generate background music tracks given video input. This is a challenging task since it requires music-video datasets, efficient architectures for video-to-music generation, and reasonable metrics, none of which currently exist. To close this gap, we introduce a comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11248v2-abstract-full').style.display = 'inline'; document.getElementById('2211.11248v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.11248v2-abstract-full" style="display: none;"> Music is essential when editing videos, but selecting music manually is difficult and time-consuming. Thus, we seek to automatically generate background music tracks given video input. This is a challenging task since it requires music-video datasets, efficient architectures for video-to-music generation, and reasonable metrics, none of which currently exist. To close this gap, we introduce a complete recipe including dataset, benchmark model, and evaluation metric for video background music generation. We present SymMV, a video and symbolic music dataset with various musical annotations. To the best of our knowledge, it is the first video-music dataset with rich musical annotations. We also propose a benchmark video background music generation framework named V-MusProd, which utilizes music priors of chords, melody, and accompaniment along with video-music relations of semantic, color, and motion features. To address the lack of objective metrics for video-music correspondence, we design a retrieval-based metric VMCP built upon a powerful video-music representation learning model. Experiments show that with our dataset, V-MusProd outperforms the state-of-the-art method in both music quality and correspondence with videos. We believe our dataset, benchmark model, and evaluation metric will boost the development of video background music generation. Our dataset and code are available at https://github.com/zhuole1025/SymMV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.11248v2-abstract-full').style.display = 'none'; document.getElementById('2211.11248v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICCV2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.15128">arXiv:2210.15128</a> <span> [<a href="https://arxiv.org/pdf/2210.15128">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11042-022-13648-8">10.1007/s11042-022-13648-8 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MMFL-Net: Multi-scale and Multi-granularity Feature Learning for Cross-domain Fashion Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chen Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xudong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiazhou Chen</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yongwei Miao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.15128v1-abstract-short" style="display: inline;"> Instance-level image retrieval in fashion is a challenging issue owing to its increasing importance in real-scenario visual fashion search. Cross-domain fashion retrieval aims to match the unconstrained customer images as queries for photographs provided by retailers; however, it is a difficult task due to a wide range of consumer-to-shop (C2S) domain discrepancies and also considering that clothi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15128v1-abstract-full').style.display = 'inline'; document.getElementById('2210.15128v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.15128v1-abstract-full" style="display: none;"> Instance-level image retrieval in fashion is a challenging issue owing to its increasing importance in real-scenario visual fashion search. Cross-domain fashion retrieval aims to match the unconstrained customer images as queries for photographs provided by retailers; however, it is a difficult task due to a wide range of consumer-to-shop (C2S) domain discrepancies and also considering that clothing image is vulnerable to various non-rigid deformations. To this end, we propose a novel multi-scale and multi-granularity feature learning network (MMFL-Net), which can jointly learn global-local aggregation feature representations of clothing images in a unified framework, aiming to train a cross-domain model for C2S fashion visual similarity. First, a new semantic-spatial feature fusion part is designed to bridge the semantic-spatial gap by applying top-down and bottom-up bidirectional multi-scale feature fusion. Next, a multi-branch deep network architecture is introduced to capture global salient, part-informed, and local detailed information, and extracting robust and discrimination feature embedding by integrating the similarity learning of coarse-to-fine embedding with the multiple granularities. Finally, the improved trihard loss, center loss, and multi-task classification loss are adopted for our MMFL-Net, which can jointly optimize intra-class and inter-class distance and thus explicitly improve intra-class compactness and inter-class discriminability between its visual representations for feature learning. Furthermore, our proposed model also combines the multi-task attribute recognition and classification module with multi-label semantic attributes and product ID labels. Experimental results demonstrate that our proposed MMFL-Net achieves significant improvement over the state-of-the-art methods on the two datasets, DeepFashion-C2S and Street2Shop. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15128v1-abstract-full').style.display = 'none'; document.getElementById('2210.15128v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 12 figures, Published by <Multimedia Tools and Applications></span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Multimedia Tools and Applications(2022)1-27 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.10290">arXiv:2210.10290</a> <span> [<a href="https://arxiv.org/pdf/2210.10290">pdf</a>, <a href="https://arxiv.org/format/2210.10290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Differentiable Self-Adaptive Learning Rate </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bozhou Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Ba%2C+C">Chenmin Ba</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.10290v1-abstract-short" style="display: inline;"> Learning rate adaptation is a popular topic in machine learning. Gradient Descent trains neural nerwork with a fixed learning rate. Learning rate adaptation is proposed to accelerate the training process through adjusting the step size in the training session. Famous works include Momentum, Adam and Hypergradient. Hypergradient is the most special one. Hypergradient achieved adaptation by calculat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10290v1-abstract-full').style.display = 'inline'; document.getElementById('2210.10290v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.10290v1-abstract-full" style="display: none;"> Learning rate adaptation is a popular topic in machine learning. Gradient Descent trains neural nerwork with a fixed learning rate. Learning rate adaptation is proposed to accelerate the training process through adjusting the step size in the training session. Famous works include Momentum, Adam and Hypergradient. Hypergradient is the most special one. Hypergradient achieved adaptation by calculating the derivative of learning rate with respect to cost function and utilizing gradient descent for learning rate. However, Hypergradient is still not perfect. In practice, Hypergradient fail to decrease training loss after learning rate adaptation with a large probability. Apart from that, evidence has been found that Hypergradient are not suitable for dealing with large datesets in the form of minibatch training. Most unfortunately, Hypergradient always fails to get a good accuracy on the validation dataset although it could reduce training loss to a very tiny value. To solve Hypergradient's problems, we propose a novel adaptation algorithm, where learning rate is parameter specific and internal structured. We conduct extensive experiments on multiple network models and datasets compared with various benchmark optimizers. It is shown that our algorithm can achieve faster and higher qualified convergence than those state-of-art optimizers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.10290v1-abstract-full').style.display = 'none'; document.getElementById('2210.10290v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2210.00647">arXiv:2210.00647</a> <span> [<a href="https://arxiv.org/pdf/2210.00647">pdf</a>, <a href="https://arxiv.org/format/2210.00647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+W">Weicai Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Pollefeys%2C+M">Marc Pollefeys</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2210.00647v3-abstract-short" style="display: inline;"> Existing inverse rendering combined with neural rendering methods can only perform editable novel view synthesis on object-specific scenes, while we present intrinsic neural radiance fields, dubbed IntrinsicNeRF, which introduce intrinsic decomposition into the NeRF-based neural rendering method and can extend its application to room-scale scenes. Since intrinsic decomposition is a fundamentally u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.00647v3-abstract-full').style.display = 'inline'; document.getElementById('2210.00647v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2210.00647v3-abstract-full" style="display: none;"> Existing inverse rendering combined with neural rendering methods can only perform editable novel view synthesis on object-specific scenes, while we present intrinsic neural radiance fields, dubbed IntrinsicNeRF, which introduce intrinsic decomposition into the NeRF-based neural rendering method and can extend its application to room-scale scenes. Since intrinsic decomposition is a fundamentally under-constrained inverse problem, we propose a novel distance-aware point sampling and adaptive reflectance iterative clustering optimization method, which enables IntrinsicNeRF with traditional intrinsic decomposition constraints to be trained in an unsupervised manner, resulting in multi-view consistent intrinsic decomposition results. To cope with the problem that different adjacent instances of similar reflectance in a scene are incorrectly clustered together, we further propose a hierarchical clustering method with coarse-to-fine optimization to obtain a fast hierarchical indexing representation. It supports compelling real-time augmented applications such as recoloring and illumination variation. Extensive experiments and editing samples on both object-specific/room-scale scenes and synthetic/real-word data demonstrate that we can obtain consistent intrinsic decomposition results and high-fidelity novel view synthesis even for challenging sequences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.00647v3-abstract-full').style.display = 'none'; document.getElementById('2210.00647v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV2023, Project webpage: https://zju3dv.github.io/intrinsic_nerf/, code: https://github.com/zju3dv/IntrinsicNeRF</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> 25 pages </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.02513">arXiv:2209.02513</a> <span> [<a href="https://arxiv.org/pdf/2209.02513">pdf</a>, <a href="https://arxiv.org/format/2209.02513">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Semi-Supervised Clustering via Dynamic Graph Structure Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+H">Huaming Ling</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xin Liang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zuoqiang Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.02513v1-abstract-short" style="display: inline;"> Most existing semi-supervised graph-based clustering methods exploit the supervisory information by either refining the affinity matrix or directly constraining the low-dimensional representations of data points. The affinity matrix represents the graph structure and is vital to the performance of semi-supervised graph-based clustering. However, existing methods adopt a static affinity matrix to l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.02513v1-abstract-full').style.display = 'inline'; document.getElementById('2209.02513v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.02513v1-abstract-full" style="display: none;"> Most existing semi-supervised graph-based clustering methods exploit the supervisory information by either refining the affinity matrix or directly constraining the low-dimensional representations of data points. The affinity matrix represents the graph structure and is vital to the performance of semi-supervised graph-based clustering. However, existing methods adopt a static affinity matrix to learn the low-dimensional representations of data points and do not optimize the affinity matrix during the learning process. In this paper, we propose a novel dynamic graph structure learning method for semi-supervised clustering. In this method, we simultaneously optimize the affinity matrix and the low-dimensional representations of data points by leveraging the given pairwise constraints. Moreover, we propose an alternating minimization approach with proven convergence to solve the proposed nonconvex model. During the iteration process, our method cyclically updates the low-dimensional representations of data points and refines the affinity matrix, leading to a dynamic affinity matrix (graph structure). Specifically, for the update of the affinity matrix, we enforce the data points with remarkably different low-dimensional representations to have an affinity value of 0. Furthermore, we construct the initial affinity matrix by integrating the local distance and global self-representation among data points. Experimental results on eight benchmark datasets under different settings show the advantages of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.02513v1-abstract-full').style.display = 'none'; document.getElementById('2209.02513v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.14318">arXiv:2208.14318</a> <span> [<a href="https://arxiv.org/pdf/2208.14318">pdf</a>, <a href="https://arxiv.org/format/2208.14318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Convergence Rates of Training Deep Neural Networks via Alternating Minimization Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jintao Xu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenglong Bao</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+W">Wenxun Xing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.14318v2-abstract-short" style="display: inline;"> Training deep neural networks (DNNs) is an important and challenging optimization problem in machine learning due to its non-convexity and non-separable structure. The alternating minimization (AM) approaches split the composition structure of DNNs and have drawn great interest in the deep learning and optimization communities. In this paper, we propose a unified framework for analyzing the conver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14318v2-abstract-full').style.display = 'inline'; document.getElementById('2208.14318v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.14318v2-abstract-full" style="display: none;"> Training deep neural networks (DNNs) is an important and challenging optimization problem in machine learning due to its non-convexity and non-separable structure. The alternating minimization (AM) approaches split the composition structure of DNNs and have drawn great interest in the deep learning and optimization communities. In this paper, we propose a unified framework for analyzing the convergence rate of AM-type network training methods. Our analysis is based on the non-monotone $j$-step sufficient decrease conditions and the Kurdyka-Lojasiewicz (KL) property, which relaxes the requirement of designing descent algorithms. We show the detailed local convergence rate if the KL exponent $胃$ varies in $[0,1)$. Moreover, the local R-linear convergence is discussed under a stronger $j$-step sufficient decrease condition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.14318v2-abstract-full').style.display = 'none'; document.getElementById('2208.14318v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 49M37; 90C26; 90C52 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.12983">arXiv:2208.12983</a> <span> [<a href="https://arxiv.org/pdf/2208.12983">pdf</a>, <a href="https://arxiv.org/format/2208.12983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> BlueTiSCH: A Multi-PHY Simulation of Low-Power 6TiSCH IoT Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bae%2C+C">Chloe Bae</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shiwen Yang</a>, <a href="/search/cs?searchtype=author&query=Baddeley%2C+M">Michael Baddeley</a>, <a href="/search/cs?searchtype=author&query=Elsts%2C+A">Atis Elsts</a>, <a href="/search/cs?searchtype=author&query=Haque%2C+I">Israat Haque</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.12983v1-abstract-short" style="display: inline;"> Low-power wireless IoT networks have traditionally operated over a single physical layer (PHY) -- many based on the IEEE 802.15.4 standard. However, recent low-power wireless chipsets offer both the IEEE 802.15.4 and all four PHYs of the Bluetooth 5 (BT 5) standard. This introduces the intriguing possibility that IoT solutions might not necessarily be bound by the limits of a single PHY, and could… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.12983v1-abstract-full').style.display = 'inline'; document.getElementById('2208.12983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.12983v1-abstract-full" style="display: none;"> Low-power wireless IoT networks have traditionally operated over a single physical layer (PHY) -- many based on the IEEE 802.15.4 standard. However, recent low-power wireless chipsets offer both the IEEE 802.15.4 and all four PHYs of the Bluetooth 5 (BT 5) standard. This introduces the intriguing possibility that IoT solutions might not necessarily be bound by the limits of a single PHY, and could actively or proactively adapt their PHY depending on RF or networking conditions (e.g., to offer a higher throughput or a longer radio range). Several recent studies have explored such use-cases. However, these studies lack comprehensive evaluation over various metrics (such as reliability, latency, and energy) with regards to scalability and the Radio Frequency (RF) environment. In this work we evaluate the performance of IEEE 802.15.4 and the four BT 5 2.4GHz PHY options for the recently completed IETF 6TiSCH low-power wireless standard. To the best of our knowledge, this is the first work to directly compare these PHYs in identical settings. Specifically, we use a recently released 6TiSCH simulator, TSCH-Sim, to compare these PHY options in networks of up to 250 nodes over different RF environments (home, industrial, and outdoor), and highlight from these results how different PHY options might be better suited to particular application use-cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.12983v1-abstract-full').style.display = 'none'; document.getElementById('2208.12983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.11911">arXiv:2207.11911</a> <span> [<a href="https://arxiv.org/pdf/2207.11911">pdf</a>, <a href="https://arxiv.org/format/2207.11911">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> NeuMesh: Learning Disentangled Neural Mesh-based Implicit Field for Geometry and Texture Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bangbang Yang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chong Bao</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Junyi Zeng</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinda Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhaopeng Cui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guofeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.11911v1-abstract-short" style="display: inline;"> Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. In this paper, we present a novel mesh-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.11911v1-abstract-full').style.display = 'inline'; document.getElementById('2207.11911v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.11911v1-abstract-full" style="display: none;"> Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. In this paper, we present a novel mesh-based representation by encoding the neural implicit field with disentangled geometry and texture codes on mesh vertices, which facilitates a set of editing functionalities, including mesh-guided geometry editing, designated texture editing with texture swapping, filling and painting operations. To this end, we develop several techniques including learnable sign indicators to magnify spatial distinguishability of mesh-based representation, distillation and fine-tuning mechanism to make a steady convergence, and the spatial-aware optimization strategy to realize precise texture editing. Extensive experiments and editing examples on both real and synthetic data demonstrate the superiority of our method on representation quality and editing ability. Code is available on the project webpage: https://zju3dv.github.io/neumesh/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.11911v1-abstract-full').style.display = 'none'; document.getElementById('2207.11911v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2022 (Oral). Project Page: https://zju3dv.github.io/neumesh/</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Bao%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Bao%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Bao%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository