Search | arXiv e-print repository
Showing 1–50 of 260 results for author: Liao, J

Searching in archive cs. Search in all archives: /search/?searchtype=author&query=Liao%2C+J
Sorted by announcement date (newest first); 50 results per page.
1. arXiv:2502.14281 [pdf, other]  cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
   Correcting Noisy Multilabel Predictions: Modeling Label Noise through Latent Space Shifts
   Authors: Weipeng Huang, Qin Li, Yang Xiao, Cheng Qiao, Tie Cai, Junwei Liao, Neil J. Hurley, Guangyuan Piao
   Abstract: Noise in data appears to be inevitable in most real-world machine learning applications and can cause severe overfitting problems. Not only can data features contain noise, but labels are also prone to noise due to human input. In this paper, rather than noisy label learning in multiclass classification, we focus on the less explored area of noisy label learning for multilabel classification. Specifically, we investigate the post-correction of predictions generated by classifiers trained with noisy labels. The reasons are two-fold. First, this approach works directly with trained models, saving computational resources. Second, it can be applied on top of other noisy-label correction techniques for further improvement. To handle this problem, we appeal to deep generative approaches capable of uncertainty estimation. Our model posits that label noise arises from a stochastic shift in the latent variable, providing a more robust and beneficial means for noisy learning. We develop both unsupervised and semi-supervised learning methods for our model. An extensive empirical study presents solid evidence that our approach consistently improves the independent models and performs better than a number of existing methods across various noisy-label settings. Moreover, a comprehensive empirical analysis, including a sensitivity analysis and an ablation study, validates the method's robustness.
   Submitted 20 February, 2025; originally announced February 2025.
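   As a reading aid, here is a toy sketch of the generative premise stated in the abstract above: a clean multilabel vector and its noisy counterpart are decoded from a latent variable z and from a stochastically shifted z, respectively. The decoder, shift scale, and threshold are illustrative assumptions, not the authors' model.

```python
# Toy illustration (assumptions throughout, not the paper's model) of labels
# arising from a latent variable z, with label noise modeled as a random
# shift of z before decoding.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(8, 4))            # toy decoder: 4-dim latent -> 8 labels

def decode(z):
    return 1 / (1 + np.exp(-W @ z))    # per-label probabilities

z = rng.normal(size=4)
clean = (decode(z) > 0.5).astype(int)                                   # labels from z
noisy = (decode(z + rng.normal(scale=0.8, size=4)) > 0.5).astype(int)   # shifted z
print(clean)
print(noisy)                           # some labels flip under the shift
```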
2. arXiv:2502.12834 [pdf, other]  cs.NI (Networking and Internet Architecture); cs.LG (Machine Learning)
   NTP-INT: Network Traffic Prediction-Driven In-band Network Telemetry for High-load Switches
   Authors: Penghui Zhang, Hua Zhang, Yuqi Dai, Cheng Zeng, Jingyu Wang, Jianxin Liao
   Abstract: In-band network telemetry (INT) is essential to network management due to its real-time visibility. However, with the rapid increase in network devices and services, targeted access to detailed network information in a dynamic network environment has become crucial. This paper proposes an intelligent network telemetry system, NTP-INT, to obtain more fine-grained network information on high-load switches. NTP-INT consists of three modules: a network traffic prediction module, a network pruning module, and a probe path planning module. First, the network traffic prediction module adopts a Multi-Temporal Graph Neural Network (MTGNN) to predict future network traffic and identify high-load switches. Then, we design a network pruning algorithm to generate a subnetwork covering all high-load switches, reducing the complexity of probe path planning. Finally, the probe path planning module uses an attention-mechanism-based deep reinforcement learning (DRL) model to plan efficient probe paths in the network slice. Experimental results demonstrate that NTP-INT acquires more precise network information on high-load switches while decreasing control overhead by 50%.
   Submitted 18 February, 2025; originally announced February 2025.

3. arXiv:2502.09624 [pdf, other]  cs.AI (Artificial Intelligence); cs.CR (Cryptography and Security)
   Efficient and Trustworthy Block Propagation for Blockchain-enabled Mobile Embodied AI Networks: A Graph Resfusion Approach
   Authors: Jiawen Kang, Jiana Liao, Runquan Gao, Jinbo Wen, Huawei Huang, Maomao Zhang, Changyan Yi, Tao Zhang, Dusit Niyato, Zibin Zheng
   Abstract: By synergistically integrating mobile networks and embodied artificial intelligence (AI), Mobile Embodied AI Networks (MEANETs) represent an advanced paradigm that facilitates autonomous, context-aware, and interactive behaviors within dynamic environments. Nevertheless, the rapid development of MEANETs is accompanied by challenges in trustworthiness and operational efficiency. Fortunately, blockchain technology, with its decentralized and immutable characteristics, offers promising solutions for MEANETs. However, existing block propagation mechanisms suffer from low propagation efficiency and weak security, which result in delayed transmission of vehicular messages or vulnerability to malicious tampering, potentially causing severe traffic accidents in blockchain-enabled MEANETs. Moreover, current block propagation strategies cannot effectively adapt to real-time changes of dynamic topology in MEANETs. Therefore, in this paper, we propose a graph Resfusion model-based trustworthy block propagation optimization framework for consortium blockchain-enabled MEANETs. Specifically, we propose an innovative trust calculation mechanism based on the trust cloud model, which comprehensively accounts for the randomness and fuzziness in miner trust evaluation. Furthermore, by leveraging the strengths of graph neural networks and diffusion models, we develop a graph Resfusion model to effectively and adaptively generate the optimal block propagation trajectory. Simulation results demonstrate that the proposed model outperforms other routing mechanisms in terms of block propagation efficiency and trustworthiness. Additionally, the results highlight its strong adaptability to dynamic environments, making it particularly suitable for rapidly changing MEANETs.
   Submitted 26 January, 2025; originally announced February 2025.
   Comments: 15 pages, 11 figures

4. arXiv:2502.02773 [pdf, other]  cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
   SD++: Enhancing Standard Definition Maps by Incorporating Road Knowledge using LLMs
   Authors: Hitvarth Diwanji, Jing-Yan Liao, Akshar Tumu, Henrik I. Christensen, Marcell Vazquez-Chanlatte, Chikao Tsuchiya
   Abstract: High-definition maps (HD maps) are detailed and informative maps capturing lane centerlines and road elements. Although very useful for autonomous driving, HD maps are costly to build and maintain. Furthermore, access to these high-quality maps is usually limited to the firms that build them. On the other hand, standard definition (SD) maps provide road centerlines with an accuracy of a few meters. In this paper, we explore the possibility of enhancing SD maps by incorporating information from road manuals using LLMs. We develop SD++, an end-to-end pipeline to enhance SD maps with location-dependent road information obtained from a road manual. We suggest and compare several ways of using LLMs for such a task. Furthermore, we show the generalization ability of SD++ through results from both California and Japan.
   Submitted 4 February, 2025; originally announced February 2025.

5. arXiv:2502.01344 [pdf, other]  cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.IR (Information Retrieval)
   PSSD: Making Large Language Models Self-denial via Human Psyche Structure
   Authors: Jinzhi Liao, Zenghua Liao, Xiang Zhao
   Abstract: Improving the accuracy of LLM reasoning has attracted the community's interest, and pioneering studies investigate post-hoc strategies to rectify potential mistakes. Despite extensive efforts, these strategies are stuck in resource competition, demanding significant time and computing expense. The cause lies in the failure to identify the fundamental feature of solutions in this line, which we coin the self-denial of LLMs: LLMs should confidently determine the potential existence of mistakes and carefully execute the targeted correction. Because the whole procedure is conducted within LLMs, supporting and persuasive references are hard to acquire, and specific steps toward refining hidden mistakes are absent even when errors are acknowledged. In response, we present PSSD, which mirrors the human psyche structure in which three distinct and interconnected roles contribute to human reasoning. Specifically, PSSD leverages the recent multi-agent paradigm with three roles: (1) an intuition-based id role that provides initial attempts from benign LLMs; (2) a rule-driven superego role that summarizes rules to regulate these attempts and returns specific key points as guidance; and (3) a script-centric ego role that absorbs all procedural information to generate an executable script for final answer prediction. Extensive experiments demonstrate that the proposed design not only enhances reasoning capabilities but also integrates seamlessly with current models, leading to superior performance.
   Submitted 3 February, 2025; originally announced February 2025.
   Comments: WWW '25

6. arXiv:2501.15087 [pdf, other]  cs.IR (Information Retrieval)
   PatchRec: Multi-Grained Patching for Efficient LLM-based Sequential Recommendation
   Authors: Jiayi Liao, Ruobing Xie, Sihang Li, Xiang Wang, Xingwu Sun, Zhanhui Kang, Xiangnan He
   Abstract: Large Language Models for sequential recommendation (LLM4SR), which transform user-item interactions into language modeling, have shown promising results. However, due to the limitations of context window size and the computational costs associated with Large Language Models (LLMs), current approaches primarily truncate user history, considering only the textual information of items from the most recent interactions in the input prompt. This truncation fails to fully capture users' long-term behavioral patterns. To address this, we propose PatchRec, a multi-grained patching framework. It compresses the textual tokens of an item title into a compact item patch, and further compresses multiple item patches into a denser session patch, with earlier interactions compressed to a greater degree. The framework consists of two stages: (1) Patch Pre-training, which familiarizes LLMs with item-level compression patterns, and (2) Patch Fine-tuning, which teaches LLMs to model sequences at multiple granularities. Through this simple yet effective approach, empirical results demonstrate that PatchRec outperforms existing methods, achieving significant performance gains with fewer tokens fed to the LLM. Specifically, PatchRec shows up to a 32% improvement in HR@20 on the Goodreads dataset over an uncompressed baseline, while using only 7% of the tokens. This multi-grained sequence modeling paradigm, with an adjustable compression ratio, enables LLMs to be efficiently deployed in real-world recommendation systems that handle extremely long user behavior sequences.
   Submitted 25 January, 2025; originally announced January 2025.
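   As a concrete reading of the patching idea in the abstract above, the sketch below mean-pools the token embeddings of each item title into an item patch and pools older item patches into coarser session patches, so earlier interactions are compressed more. Pooling by averaging and the group size are assumptions for illustration; the paper's actual compression is learned.

```python
# Illustrative sketch (not the authors' code) of multi-grained patching:
# token embeddings -> item patches -> session patches for older history.
import numpy as np

def mean_pool(vectors):
    """Compress a sequence of embeddings into a single patch embedding."""
    return np.mean(vectors, axis=0)

def build_patched_history(item_token_embs, session_size=4):
    """item_token_embs: list (oldest -> newest) of [n_tokens, d] arrays."""
    item_patches = [mean_pool(tok) for tok in item_token_embs]  # item level
    recent = item_patches[-session_size:]      # recent items stay fine-grained
    older = item_patches[:-session_size]
    session_patches = [
        mean_pool(older[i:i + session_size])   # older items compressed harder
        for i in range(0, len(older), session_size)
    ]
    return session_patches + recent            # short sequence fed to the LLM

d = 16
history = [np.random.randn(np.random.randint(3, 8), d) for _ in range(10)]
print(len(build_patched_history(history)))     # 10 items shrink to 6 patches
```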
7. arXiv:2501.10917 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction)
   Decomposing and Fusing Intra- and Inter-Sensor Spatio-Temporal Signal for Multi-Sensor Wearable Human Activity Recognition
   Authors: Haoyu Xie, Haoxuan Li, Chunyuan Zheng, Haonan Yuan, Guorui Liao, Jun Liao, Li Liu
   Abstract: Wearable Human Activity Recognition (WHAR) is a prominent research area within ubiquitous computing. Multi-sensor synchronous measurement has proven more effective for WHAR than using a single sensor. However, existing WHAR methods use shared convolutional kernels for indiscriminate temporal feature extraction across each sensor variable, which fails to effectively capture the spatio-temporal relationships of intra-sensor and inter-sensor variables. We propose the DecomposeWHAR model, consisting of a decomposition phase and a fusion phase, to better model the relationships between modality variables. The decomposition creates high-dimensional representations of each intra-sensor variable through an improved Depth Separable Convolution to capture local temporal features while preserving their unique characteristics. The fusion phase begins by capturing relationships between intra-sensor variables and fusing their features at both the channel and variable levels. Long-range temporal dependencies are modeled using a State Space Model (SSM), and cross-sensor interactions are then dynamically captured through a self-attention mechanism, highlighting inter-sensor spatial correlations. Our model demonstrates superior performance on three widely used WHAR datasets, significantly outperforming state-of-the-art models while maintaining acceptable computational efficiency. Our code and supplementary materials are available at https://github.com/Anakin2555/DecomposeWHAR.
   Submitted 18 January, 2025; originally announced January 2025.

8. arXiv:2501.09921 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
   TalkingEyes: Pluralistic Speech-Driven 3D Eye Gaze Animation
   Authors: Yixiang Zhuang, Chunshan Ma, Yao Cheng, Xuan Cheng, Jing Liao, Juncong Lin
   Abstract: Although significant progress has been made in speech-driven 3D facial animation recently, the speech-driven animation of an indispensable facial component, eye gaze, has been overlooked by recent research. This is primarily due to the weak correlation between speech and eye gaze, as well as the scarcity of audio-gaze data, making it very challenging to generate 3D eye gaze motion from speech alone. In this paper, we propose a novel data-driven method that can generate diverse 3D eye gaze motions in harmony with the speech. To achieve this, we first construct an audio-gaze dataset containing about 14 hours of audio-mesh sequences featuring high-quality eye gaze motion, head motion, and facial motion simultaneously. The motion data is acquired by performing lightweight eye gaze fitting and face reconstruction on videos from existing audio-visual datasets. We then tailor a novel speech-to-motion translation framework in which head motions and eye gaze motions are jointly generated from speech but modeled in two separate latent spaces. This design stems from the physiological knowledge that the rotation range of the eyeballs is smaller than that of the head. By mapping the speech embedding into the two latent spaces, the difficulty of modeling the weak correlation between speech and non-verbal motion is attenuated. Finally, our TalkingEyes, integrated with a speech-driven 3D facial motion generator, can synthesize eye gaze motion, eye blinks, head motion, and facial motion collectively from speech. Extensive quantitative and qualitative evaluations demonstrate the superiority of the proposed method in generating diverse and natural 3D eye gaze motions from speech. The project page of this paper is: https://lkjkjoiuiu.github.io/TalkingEyes_Home/
   Submitted 16 January, 2025; originally announced January 2025.

9. arXiv:2412.11706 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
   AsymRnR: Video Diffusion Transformers Acceleration with Asymmetric Reduction and Restoration
   Authors: Wenhao Sun, Rong-Cheng Tu, Jingyi Liao, Zhao Jin, Dacheng Tao
   Abstract: Video Diffusion Transformers (DiTs) have demonstrated significant potential for generating high-fidelity videos but are computationally intensive. Existing acceleration methods include distillation, which requires costly retraining, and feature caching, which is highly sensitive to network architecture. Recent token reduction methods are training-free and architecture-agnostic, offering greater flexibility and wider applicability. However, they enforce the same sequence length across different components, constraining their acceleration potential. We observe that intra-sequence redundancy in video DiTs varies across features, blocks, and denoising timesteps. Building on this observation, we propose Asymmetric Reduction and Restoration (AsymRnR), a training-free approach to accelerate video DiTs. It offers a flexible and adaptive strategy that reduces the number of tokens based on their redundancy to enhance both acceleration and generation quality. We further propose a matching cache to facilitate faster processing. Integrated into state-of-the-art video DiTs, AsymRnR achieves a superior speedup without compromising quality.
   Submitted 16 December, 2024; originally announced December 2024.
   Comments: 11 pages, 7 figures
10. arXiv:2412.11376 [pdf, other]  cs.CL (Computation and Language); cs.LG (Machine Learning)
   ChatTime: A Unified Multimodal Time Series Foundation Model Bridging Numerical and Textual Data
   Authors: Chengsen Wang, Qi Qi, Jingyu Wang, Haifeng Sun, Zirui Zhuang, Jinming Wu, Lei Zhang, Jianxin Liao
   Abstract: Human experts typically integrate numerical and textual multimodal information to analyze time series. However, most traditional deep learning predictors rely solely on unimodal numerical data, using a fixed-length window for training and prediction on a single dataset, and cannot adapt to different scenarios. Powerful pre-trained large language models have introduced new opportunities for time series analysis. Yet, existing methods are either inefficient in training, incapable of handling textual information, or lack zero-shot forecasting capability. In this paper, we innovatively model time series as a foreign language and construct ChatTime, a unified framework for time series and text processing. As an out-of-the-box multimodal time series foundation model, ChatTime provides zero-shot forecasting capability and supports bimodal input/output for both time series and text. We design a series of experiments to verify the superior performance of ChatTime across multiple tasks and scenarios, and create four multimodal datasets to address data gaps. The experimental results demonstrate the potential and utility of ChatTime.
   Submitted 15 December, 2024; originally announced December 2024.
   Comments: Accepted by AAAI 2025
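   A minimal sketch of what "modeling time series as a foreign language" can mean in practice, assuming simple min-max scaling and uniform binning into a discrete vocabulary (the bin count and token format are illustrative, not ChatTime's actual tokenizer):

```python
# Toy discretization of a real-valued series into language-model tokens.
def series_to_tokens(values, n_bins=100):
    lo, hi = min(values), max(values)
    scale = (hi - lo) or 1.0
    return [f"<v{int((v - lo) / scale * (n_bins - 1))}>" for v in values]

def tokens_to_series(tokens, lo, hi, n_bins=100):
    scale = (hi - lo) or 1.0
    return [int(t[2:-1]) / (n_bins - 1) * scale + lo for t in tokens]

ts = [12.0, 13.5, 15.2, 14.1]
print(series_to_tokens(ts))   # ['<v0>', '<v46>', '<v99>', '<v64>']
```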
11. arXiv:2412.10770 [pdf, other]  cs.DB (Databases); cs.IR (Information Retrieval)
   Learned Data Compression: Challenges and Opportunities for the Future
   Authors: Qiyu Liu, Siyuan Han, Jianwei Liao, Jin Li, Jingshu Peng, Jun Du, Lei Chen
   Abstract: Compressing integer keys is a fundamental operation across multiple communities, such as database management (DB), information retrieval (IR), and high-performance computing (HPC). Recent advances in learned indexes have inspired the development of learned compressors, which leverage simple yet compact machine learning (ML) models to compress large-scale sorted keys. The core idea behind learned compressors is to losslessly encode sorted keys by approximating them with error-bounded ML models (e.g., piecewise linear functions) and using a residual array to guarantee accurate key reconstruction. While the concept of learned compressors remains in its early stages of exploration, our benchmark results demonstrate that a SIMD-optimized learned compressor can significantly outperform state-of-the-art CPU-based compressors. Drawing on our preliminary experiments, this vision paper explores the potential of learned data compression to enhance critical areas in DBMS and related domains. Furthermore, we outline the key technical challenges that existing systems must address when integrating this emerging methodology.
   Submitted 14 December, 2024; originally announced December 2024.
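   The core mechanism named in the abstract (an error-bounded model plus a residual array that makes decoding lossless) fits in a few lines. The sketch below uses a single least-squares line where real learned compressors use piecewise models; that simplification is the main assumption here:

```python
# Sketch of a learned compressor: approximate sorted keys with a linear
# model and store only the residuals, which reconstruct the keys exactly.
import numpy as np

keys = np.sort(np.random.randint(0, 1_000_000, size=1000)).astype(np.int64)
pos = np.arange(len(keys))

slope, intercept = np.polyfit(pos, keys, 1)            # fitted linear model
pred = np.rint(slope * pos + intercept).astype(np.int64)
residual = keys - pred                                 # small, bit-packable integers

# Lossless reconstruction from (slope, intercept, residual):
recon = np.rint(slope * pos + intercept).astype(np.int64) + residual
assert np.array_equal(recon, keys)
print("max |residual|:", np.abs(residual).max())       # bound determines bits per key
```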
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08939">arXiv:2412.08939</a> <span> [<a href="https://arxiv.org/pdf/2412.08939">pdf</a>, <a href="https://arxiv.org/format/2412.08939">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Contrastive Knowledge Distillation for Efficient Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yunshuai Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+J">Junbo Qiao</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jincheng Liao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Simiao Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jiao Xie</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yunhang Shen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jie Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shaohui Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08939v2-abstract-short" style="display: inline;"> Knowledge distillation (KD) is a valuable yet challenging approach that enhances a compact student network by learning from a high-performance but cumbersome teacher model. However, previous KD methods for image restoration overlook the state of the student during the distillation, adopting a fixed solution space that limits the capability of KD. Additionally, relying solely on L1-type loss strugg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08939v2-abstract-full').style.display = 'inline'; document.getElementById('2412.08939v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08939v2-abstract-full" style="display: none;"> Knowledge distillation (KD) is a valuable yet challenging approach that enhances a compact student network by learning from a high-performance but cumbersome teacher model. However, previous KD methods for image restoration overlook the state of the student during the distillation, adopting a fixed solution space that limits the capability of KD. Additionally, relying solely on L1-type loss struggles to leverage the distribution information of images. In this work, we propose a novel dynamic contrastive knowledge distillation (DCKD) framework for image restoration. Specifically, we introduce dynamic contrastive regularization to perceive the student's learning state and dynamically adjust the distilled solution space using contrastive learning. Additionally, we also propose a distribution mapping module to extract and align the pixel-level category distribution of the teacher and student models. Note that the proposed DCKD is a structure-agnostic distillation framework, which can adapt to different backbones and can be combined with methods that optimize upper-bound constraints to further enhance model performance. Extensive experiments demonstrate that DCKD significantly outperforms the state-of-the-art KD methods across various image restoration tasks and backbones. 
arXiv:2412.07367 [pdf, other] cs.CL
My Words Imply Your Opinion: Reader Agent-Based Propagation Enhancement for Personalized Implicit Emotion Analysis
Authors: Jian Liao, Yu Feng, Yujin Zheng, Jun Zhao, Suge Wang, Jianxing Zheng
Abstract: The subtlety of emotional expressions makes implicit emotion analysis (IEA) particularly sensitive to user-specific characteristics. Current studies personalize emotion analysis by focusing on the author but neglect the impact of the intended reader on implicit emotional feedback. In this paper, we introduce Personalized IEA (PIEA) and present the RAPPIE model, which addresses subjective variability by incorporating reader feedback. In particular, (1) we create reader agents based on large language models to simulate reader feedback, overcoming the "spiral of silence" effect and the incompleteness of real reader-reaction data; (2) we develop role-aware multi-view graph learning to model the interactive propagation of emotion in scenarios with sparse reader information;
(3) we construct two new PIEA datasets covering English and Chinese social media with detailed user metadata, addressing the text-centric limitation of existing datasets. Extensive experiments show that RAPPIE significantly outperforms state-of-the-art baselines, demonstrating the value of incorporating reader feedback in PIEA.
Submitted 13 February, 2025; v1 submitted 10 December, 2024; originally announced December 2024.

arXiv:2411.18983 [pdf, other] cs.CV cs.MA
SPAgent: Adaptive Task Decomposition and Model Selection for General Video Generation and Editing
Authors: Rong-Cheng Tu, Wenhao Sun, Zhao Jin, Jingyi Liao, Jiaxing Huang, Dacheng Tao
Abstract: While open-source video generation and editing models have made significant progress, individual models are typically limited to specific tasks and fail to meet the diverse needs of users. Effectively coordinating these models can unlock a wide range of video generation and editing capabilities.
However, manual coordination is complex and time-consuming: it requires users to understand task requirements in depth and to know each model's performance, applicability, and limitations, which raises the barrier to entry. To address these challenges, we propose a novel video generation and editing system powered by our Semantic Planning Agent (SPAgent). SPAgent bridges the gap between diverse user intents and the effective utilization of existing generative models, enhancing the adaptability, efficiency, and overall quality of video generation and editing. Specifically, SPAgent assembles a tool library that integrates state-of-the-art open-source image and video generation and editing models. After fine-tuning on our manually annotated dataset, SPAgent automatically coordinates these tools through a newly designed three-step framework: (1) decoupled intent recognition, (2) principle-guided route planning, and (3) capability-based execution model selection. Additionally, we enhance SPAgent's video quality evaluation capability, enabling it to autonomously assess and incorporate new video generation and editing models into its tool library without human intervention. Experimental results demonstrate that SPAgent effectively coordinates models to generate or edit videos, highlighting its versatility and adaptability across various video tasks.
Submitted 28 November, 2024; originally announced November 2024.
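In outline, the three-step framework reduces to a routing function over a tool registry. The toy Python below is a hypothetical rendering of that control flow only; SPAgent itself is a fine-tuned LLM, and the registry, callables, and scores here are invented for illustration:

    TOOLS = {"text_to_video": "generate", "video_inpaint": "edit"}  # toy registry

    def route(request, recognize_intent, rank_tool):
        steps = recognize_intent(request)                     # step 1: intent recognition
        plan = list(steps)                                    # step 2: route planning (trivial here)
        return [max(TOOLS, key=lambda t: rank_tool(t, step))  # step 3: model selection
                for step in plan]

    chosen = route(
        "remove the car, then animate the scene",
        recognize_intent=lambda r: ["edit", "generate"],   # stand-in for the LLM
        rank_tool=lambda tool, step: TOOLS[tool] == step,  # stand-in capability score
    )
    print(chosen)  # ['video_inpaint', 'text_to_video']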
arXiv:2411.17605 [pdf, other] cs.CV
Distractor-free Generalizable 3D Gaussian Splatting
Authors: Yanqi Bao, Jing Liao, Jing Huo, Yang Gao
Abstract: We present DGGS, a novel framework addressing the previously unexplored challenge of distractor-free generalizable 3D Gaussian Splatting (3DGS). It accomplishes two key objectives: fortifying generalizable 3DGS against distractor-laden data during both training and inference, and extending cross-scene adaptation capabilities to conventional distractor-free approaches. To achieve these objectives, DGGS introduces a scene-agnostic, reference-based mask prediction and refinement methodology during the training phase, coupled with a training-view selection strategy; together these improve distractor-prediction accuracy and training stability. Moreover, to address distractor-induced voids and artifacts at inference time, we propose a two-stage inference framework that improves reference selection based on the predicted distractor masks, complemented by a distractor-pruning module that eliminates residual distractor effects. Extensive generalization experiments demonstrate DGGS's advantages under distractor-laden conditions. Additionally, experimental results show that our scene-agnostic mask inference achieves accuracy comparable to scene-specific trained methods. Homepage: https://github.com/bbbbby-99/DGGS
Submitted 26 November, 2024; originally announced November 2024.
arXiv:2411.16602 [pdf, other] cs.CV cs.GR
Chat2SVG: Vector Graphics Generation with Large Language Models and Image Diffusion Models
Authors: Ronghuan Wu, Wanchao Su, Jing Liao
Abstract: Scalable Vector Graphics (SVG) has become the de facto standard for vector graphics in digital design, offering resolution independence and precise control over individual elements. Despite these advantages, creating high-quality SVG content remains challenging: it demands technical expertise with professional editing software and a considerable time investment to craft complex shapes. Recent text-to-SVG generation methods aim to make vector graphics creation more accessible, but they still encounter limitations in shape regularity, generalization ability, and expressiveness. To address these challenges, we introduce Chat2SVG, a hybrid framework that combines the strengths of Large Language Models (LLMs) and image diffusion models for text-to-SVG generation. Our approach first uses an LLM to generate semantically meaningful SVG templates from basic geometric primitives. Guided by image diffusion models, a dual-stage optimization pipeline then refines paths in latent space and adjusts point coordinates to enhance geometric complexity. Extensive experiments show that Chat2SVG outperforms existing methods in visual fidelity, path regularity, and semantic alignment. Additionally, our system enables intuitive editing through natural language instructions, making professional vector graphics creation accessible to all users.
Submitted 25 November, 2024; originally announced November 2024.
Comments: Project Page: https://chat2svg.github.io/
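The first stage's output, an SVG template assembled from basic geometric primitives, is easy to picture as plain markup. The helper below is purely illustrative; the actual templates come from prompted LLM output, and every shape and attribute here is an invented example:

    def svg_template(width=256, height=256):
        """Assemble a toy 'character' template from geometric primitives."""
        shapes = [
            '<circle cx="128" cy="90" r="40" fill="#f4c542"/>',                     # head
            '<rect x="98" y="130" width="60" height="80" rx="12" fill="#4287f5"/>',  # body
            '<ellipse cx="128" cy="225" rx="50" ry="12" fill="#cccccc"/>',           # shadow
        ]
        body = "\n  ".join(shapes)
        return (f'<svg xmlns="http://www.w3.org/2000/svg" '
                f'width="{width}" height="{height}">\n  {body}\n</svg>')

    print(svg_template())

The later diffusion-guided stages then operate on such primitives, refining path geometry beyond what the template alone provides.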
arXiv:2411.16061 [pdf, other] cs.CV
DOI: 10.1109/TPAMI.2025.3530246
Scaling Spike-driven Transformer with Efficient Spike Firing Approximation Training
Authors: Man Yao, Xuerui Qiu, Tianxiang Hu, Jiakui Hu, Yuhong Chou, Keyu Tian, Jianxing Liao, Luziwei Leng, Bo Xu, Guoqi Li
Abstract: The ambition of brain-inspired Spiking Neural Networks (SNNs) is to become a low-power alternative to traditional Artificial Neural Networks (ANNs). This work addresses two major challenges in realizing this vision: the performance gap between SNNs and ANNs, and the high training costs of SNNs. We identify intrinsic flaws in spiking neurons caused by their binary firing mechanism and propose a Spike Firing Approximation (SFA) method that uses integer training and spike-driven inference. This optimizes the spike firing pattern of spiking neurons, enabling more efficient training, lower power consumption, better performance, easier scaling, and better utilization of neuromorphic chips.
We also develop an efficient spike-driven Transformer architecture and a spike-masked autoencoder to prevent performance degradation during SNN scaling. On ImageNet-1k, we achieve state-of-the-art top-1 accuracy of 78.5%, 79.8%, 84.0%, and 86.2% with models containing 10M, 19M, 83M, and 173M parameters, respectively. For instance, the 10M model outperforms the best existing SNN by 7.2% on ImageNet, with training-time acceleration and inference energy efficiency improved by 4.5× and 3.9×, respectively. We validate the effectiveness and efficiency of the proposed method across various tasks, including object detection, semantic segmentation, and neuromorphic vision tasks. This work enables SNNs to match ANN performance while maintaining the low-power advantage, marking a significant step towards SNNs as a general visual backbone. Code is available at https://github.com/BICLab/Spike-Driven-Transformer-V3.
Submitted 24 November, 2024; originally announced November 2024.
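As the abstract describes it, SFA trains with integer activations and unrolls each integer into that many binary spikes at inference. A toy straight-through version of that pattern follows; the class name, spike cap, and unrolling scheme are this sketch's assumptions, not the paper's exact formulation:

    import torch

    class IntegerSpike(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, max_spikes=4):
            ctx.save_for_backward(x)
            ctx.max_spikes = max_spikes
            return torch.clamp(torch.round(x), 0, max_spikes)  # integer firing count

        @staticmethod
        def backward(ctx, grad_out):
            (x,) = ctx.saved_tensors
            inside = ((x >= 0) & (x <= ctx.max_spikes)).float()
            return grad_out * inside, None                      # straight-through estimator

    def to_spike_train(counts, timesteps=4):
        """Inference view: a count k becomes k ones across `timesteps` binary steps."""
        t = torch.arange(timesteps).view(-1, *([1] * counts.dim()))
        return (t < counts.unsqueeze(0)).float()

    x = 3 * torch.randn(2, 5, requires_grad=True)
    counts = IntegerSpike.apply(x)     # differentiable integer activations for training
    spikes = to_spike_train(counts)    # (4, 2, 5) binary tensor; sums back to counts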
arXiv:2411.07176 [pdf, other] cs.CL cs.AI cs.LG
More Expressive Attention with Negative Weights
Authors: Ang Lv, Ruobing Xie, Shuaipeng Li, Jiayi Liao, Xingwu Sun, Zhanhui Kang, Di Wang, Rui Yan
Abstract: We propose a novel attention mechanism, named Cog Attention, that enables attention weights to be negative for enhanced expressiveness. This stems from two key factors: (1) Cog Attention enhances parameter flexibility. For example, unlike traditional softmax attention heads, which use a static output-value (OV) matrix to delete or copy inputs that the heads attend to, Cog Attention naturally learns to use the sign of dynamic query-key (QK) inner products to represent these operations. This enables Cog Attention to perform multiple operations simultaneously within a single head, while its OV matrix can focus more on refinement or modification. (2) Cog Attention enhances the model's robustness against representational collapse by preventing the "over-squashing" of earlier tokens into later positions. We develop Transformer-like models that use Cog Attention as their attention modules, including decoder-only models at various scales for language modeling and U-ViT diffusion models for image generation. Experiments show that models using Cog Attention outperform those employing traditional softmax attention modules. Our approach suggests a promising research direction for rethinking and breaking the entrenched constraints of traditional softmax attention, such as the requirement for non-negative weights.
Submitted 30 January, 2025; v1 submitted 11 November, 2024; originally announced November 2024.
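One plausible way to let attention weights carry the sign of the QK product while keeping a softmax-style normalization of magnitudes is sketched below. This is a reading of the abstract, not necessarily the paper's exact formula:

    import torch
    import torch.nn.functional as F

    def signed_attention(q, k, v):
        """q, k, v: (..., seq, dim). Weights may be negative, unlike softmax."""
        scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
        weights = torch.sign(scores) * F.softmax(scores.abs(), dim=-1)  # signed weights
        return weights @ v

A negative weight lets a head subtract (roughly, "delete") a value vector in the same pass in which other positions are copied, which is the multi-operation-per-head behavior the abstract highlights.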
arXiv:2410.17694 [pdf, other] cs.CL cs.AI
An Adaptive Framework for Generating Systematic Explanatory Answer in Online Q&A Platforms
Authors: Ziyang Chen, Xiaobin Wang, Yong Jiang, Jinzhi Liao, Pengjun Xie, Fei Huang, Xiang Zhao
Abstract: Question Answering (QA) systems face challenges in handling complex questions that require multi-domain knowledge synthesis. Naive RAG models, although effective in information retrieval, struggle with complex questions that demand comprehensive and in-depth answers. We define this pioneering task as explanatory answer generation, which entails handling challenges such as the need for comprehensive information and for logical coherence within the generated context. To address these issues, we draw on systematic thinking theory and propose SynthRAG, an innovative framework designed to enhance QA performance. SynthRAG improves on conventional models by employing adaptive outlines for dynamic content structuring, generating systematic information to ensure detailed coverage, and producing customized answers tailored to specific user inquiries. This structured approach guarantees logical coherence and thorough integration of information, yielding responses that are both insightful and methodically organized. Empirical evaluations underscore SynthRAG's effectiveness, demonstrating its superiority in handling complex questions, overcoming the limitations of naive RAG models, and significantly improving answer quality and depth.
Furthermore, an online deployment on the Zhihu platform revealed that SynthRAG's answers achieved notable user engagement: each response averaged 5.73 upvotes and surpassed the performance of 79.8% of human contributors, highlighting the practical relevance and impact of the proposed framework. Our code is available at https://github.com/czy1999/SynthRAG
Submitted 23 October, 2024; originally announced October 2024.
Comments: 10 pages, 6 figures
ACM Class: I.2.7

arXiv:2410.12519 [pdf, other] cs.IR
RosePO: Aligning LLM-based Recommenders with Human Values
Authors: Jiayi Liao, Xiangnan He, Ruobing Xie, Jiancan Wu, Yancheng Yuan, Xingwu Sun, Zhanhui Kang, Xiang Wang
Abstract: Recently, there has been growing interest in leveraging Large Language Models (LLMs) for recommendation systems, which usually adapt a pre-trained LLM to the recommendation scenario through supervised fine-tuning (SFT).
However, both the pre-training and SFT stages fail to explicitly model the comparative relationships among a user's preferences for different items. To construct a "helpful and harmless" LLM-based recommender, we propose a general framework, Recommendation with smoothing personalized Preference Optimization (RosePO), which better aligns with customized human values during the post-training stage. Specifically, in addition to the input and chosen response that naturally align with SFT data, we design a rejected-sampling strategy tailored to enhancing helpfulness, along with two strategies aimed at mitigating biases to promote harmlessness. To ensure robustness against the uncertain labels present in automatically constructed preference data, we introduce a personalized smoothing factor, predicted by a preference oracle, into the optimization objective. Evaluation on three real-world datasets demonstrates the effectiveness of our method, showcasing not only improved recommendation performance but also mitigation of semantic hallucination and popularity bias.
Submitted 16 October, 2024; originally announced October 2024.
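The smoothing idea can be shown in miniature as a label-smoothed, DPO-style preference loss in which a per-sample factor softens possibly noisy chosen/rejected labels. This sketch assumes a standard DPO margin; how RosePO's oracle produces the factor is the paper's contribution and is left abstract here:

    import torch
    import torch.nn.functional as F

    def smoothed_preference_loss(logp_chosen, logp_rejected,
                                 ref_chosen, ref_rejected,
                                 smooth, beta=0.1):
        """smooth: per-sample tensor in [0, 0.5); 0 recovers the unsmoothed loss."""
        # DPO-style margin between policy and reference log-ratios
        margin = beta * ((logp_chosen - ref_chosen) - (logp_rejected - ref_rejected))
        # with weight `smooth`, treat the preference label as possibly flipped
        loss = -((1 - smooth) * F.logsigmoid(margin) + smooth * F.logsigmoid(-margin))
        return loss.mean()

A larger smoothing factor for a sample says "trust this automatically constructed pair less," flattening its gradient rather than discarding it.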
arXiv:2410.11815 [pdf, other] cs.CV
SGEdit: Bridging LLM with Text2Image Generative Model for Scene Graph-based Image Editing
Authors: Zhiyuan Zhang, DongDong Chen, Jing Liao
Abstract: Scene graphs offer a structured, hierarchical representation of images, with nodes and edges symbolizing objects and the relationships among them. They can serve as a natural interface for image editing, dramatically improving precision and flexibility. Leveraging this benefit, we introduce a new framework that integrates a large language model (LLM) with a Text2Image generative model for scene graph-based image editing. This integration enables precise modifications at the object level and creative recomposition of scenes without compromising overall image integrity. Our approach involves two primary stages: (1) using an LLM-driven scene parser, we construct an image's scene graph, capturing key objects and their interrelationships, and parse fine-grained attributes such as object masks and descriptions; these annotations facilitate concept learning with a fine-tuned diffusion model, representing each object with an optimized token and a detailed description prompt. (2) During the image editing phase, an LLM editing controller guides the edits towards specific areas, and an attention-modulated diffusion editor implements them, using the fine-tuned model to perform object additions, deletions, replacements, and adjustments. Through extensive experiments, we demonstrate that our framework significantly outperforms existing image editing methods in editing precision and scene aesthetics.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Accepted by ACM Transactions on Graphics and SIGGRAPH Asia 2024. Project page: https://bestzzhang.github.io/SGEdit
arXiv:2410.10140 [pdf, other] cs.CV
Hi-Mamba: Hierarchical Mamba for Efficient Image Super-Resolution
Authors: Junbo Qiao, Jincheng Liao, Wei Li, Yulun Zhang, Yong Guo, Yi Wen, Zhangxizi Qiu, Jiao Xie, Jie Hu, Shaohui Lin
Abstract: State Space Models (SSMs), such as Mamba, have shown strong representation ability in modeling long-range dependencies with linear complexity, achieving successful applications from high-level to low-level vision tasks. However, an SSM's sequential nature necessitates multiple scans in different directions to compensate for the loss of spatial dependency when unfolding the image into a 1D sequence. This multi-direction scanning strategy significantly increases the computation overhead and is prohibitive for high-resolution image processing. To address this problem, we propose a novel Hierarchical Mamba network, namely Hi-Mamba, for image super-resolution (SR). Hi-Mamba consists of two key designs: (1) the Hierarchical Mamba Block (HMB), assembled from a Local SSM (L-SSM) and a Region SSM (R-SSM), each with single-direction scanning, aggregates multi-scale representations to enhance context modeling; (2) the Direction Alternation Hierarchical Mamba Group (DA-HMG) allocates isomeric single-direction scanning across cascading HMBs to enrich spatial-relationship modeling.
Extensive experiments demonstrate the superiority of Hi-Mamba across five benchmark datasets for efficient SR. For example, Hi-Mamba achieves a significant PSNR improvement of 0.29 dB on Manga109 for ×3 SR over the strong lightweight MambaIR.
Submitted 14 October, 2024; originally announced October 2024.

arXiv:2410.09713 [pdf, other] cs.IR cs.AI
Agentic Information Retrieval
Authors: Weinan Zhang, Junwei Liao, Ning Li, Kounianhua Du, Jianghao Lin
Abstract: Since the 1970s, information retrieval (IR) has been defined as the process of acquiring relevant information items from a pre-defined corpus to satisfy user information needs. Traditional IR systems, while effective in domains like web search, are constrained by their reliance on static, pre-defined information items. To this end, this paper introduces agentic information retrieval (Agentic IR), a transformative next-generation paradigm for IR driven by large language models (LLMs) and AI agents. The central shift in Agentic IR is the evolving definition of "information": from static, pre-defined information items to dynamic, context-dependent information states.
An information state refers to the particular information context a user is in within a dynamic environment, encompassing not only the acquired information items but also real-time user preferences, contextual factors, and decision-making processes. In this way, traditional information retrieval, focused on acquiring relevant information items based on user queries, extends naturally to achieving a target information state given a user instruction, which is what defines agentic information retrieval. We systematically discuss Agentic IR from various aspects: task formulation, architecture, evaluation, case studies, and challenges and future prospects. We believe the concept of Agentic IR introduced in this paper not only broadens the scope of information retrieval research but also lays the foundation for a more adaptive, interactive, and intelligent next-generation IR paradigm.
Submitted 22 February, 2025; v1 submitted 12 October, 2024; originally announced October 2024.
Comments: 11 pages, perspective paper

arXiv:2410.08877 [pdf, other] cs.LG cs.DB cs.IR cs.MM
Interdependency Matters: Graph Alignment for Multivariate Time Series Anomaly Detection
Authors: Yuanyi Wang, Haifeng Sun, Chengsen Wang, Mengde Zhu, Jingyu Wang, Wei Tang, Qi Qi, Zirui Zhuang, Jianxin Liao
Abstract: Anomaly detection in multivariate time series (MTS) is crucial for various applications in data mining and industry.
Current industrial methods typically approach anomaly detection as an unsupervised learning task, aiming to identify deviations by estimating the normal distribution in noisy, label-free datasets. These methods increasingly incorporate interdependencies between channels through graph structures to enhance accuracy. However, the role of interdependencies is more critical than previously understood: shifts in the interdependencies between MTS channels from normal to anomalous data are significant. This observation suggests that anomalies could be detected by changes in these interdependency graph series. To capitalize on this insight, we introduce MADGA (MTS Anomaly Detection via Graph Alignment), which redefines anomaly detection as a graph alignment (GA) problem that explicitly utilizes interdependencies. MADGA dynamically transforms subsequences into graphs to capture the evolving interdependencies, and graph alignment is performed between these graphs, optimizing an alignment plan that minimizes cost, effectively minimizing the distance for normal data and maximizing it for anomalous data. Uniquely, our GA approach involves explicit alignment of both nodes and edges, employing the Wasserstein distance for nodes and the Gromov-Wasserstein distance for edges. To our knowledge, this is the first application of GA to MTS anomaly detection that explicitly leverages interdependencies for this purpose. Extensive experiments on diverse real-world datasets validate the effectiveness of MADGA, demonstrating its capability to detect anomalies and differentiate interdependencies, consistently achieving state-of-the-art results across various scenarios.
Submitted 11 October, 2024; originally announced October 2024.
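MADGA's node-level cost is a Wasserstein distance between the node features of two subsequence graphs. For equal-size graphs with uniform weights this reduces to an optimal assignment problem, so a toy version fits in a few lines (the feature dimensions and data are invented; edges would additionally need a Gromov-Wasserstein term):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def node_wasserstein(x, y):
        """x, y: (n_nodes, dim) node-feature matrices of two subsequence graphs."""
        cost = np.linalg.norm(x[:, None, :] - y[None, :, :], axis=-1)  # pairwise distances
        rows, cols = linear_sum_assignment(cost)                       # optimal transport plan
        return cost[rows, cols].mean()

    rng = np.random.default_rng(0)
    normal_a = rng.normal(size=(8, 4))
    normal_b = rng.normal(size=(8, 4))
    anomalous = rng.normal(loc=3.0, size=(8, 4))   # shifted channel statistics
    print(node_wasserstein(normal_a, normal_b))    # small: similar interdependency graphs
    print(node_wasserstein(normal_a, anomalous))   # large: flags a distribution shift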
arXiv:2410.06943 [pdf, other] cs.SE cs.AI
AutoFeedback: An LLM-based Framework for Efficient and Accurate API Request Generation
Authors: Huanxi Liu, Jiaqi Liao, Dawei Feng, Kele Xu, Huaimin Wang
Abstract: Large Language Models (LLMs) leverage external tools primarily through generating API requests, enhancing task-completion efficiency. The accuracy of API request generation significantly determines the capability of LLMs to accomplish tasks. Due to inherent hallucinations, it is difficult for an LLM to efficiently and accurately generate correct API requests. Current research uses prompt-based feedback to facilitate LLM-based API request generation, but existing methods lack factual information and are insufficiently detailed. To address these issues, we propose AutoFeedback, an LLM-based framework for efficient and accurate API request generation, with a Static Scanning Component (SSC) and a Dynamic Analysis Component (DAC). The SSC incorporates errors detected in the API requests as pseudo-facts into the feedback, enriching the factual information; the DAC retrieves information from API documentation, enhancing the level of detail in the feedback. Based on these two components, AutoFeedback implements two feedback loops during the LLM's generation of API requests. Extensive experiments demonstrate that it significantly improves the accuracy of API request generation and reduces interaction cost: AutoFeedback achieves an accuracy of 100.00% on a real-world API dataset and reduces the cost of interaction with GPT-3.5 Turbo by 23.44% and with GPT-4 Turbo by 11.85%.
Submitted 9 October, 2024; originally announced October 2024.
Comments: 17 pages
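At their core, the two feedback loops form a generate-scan-retry cycle. A self-contained toy follows; the endpoint schema, the scanner, and the toy model are all invented stand-ins, and the real SSC and DAC are considerably richer:

    KNOWN_API = {"GET /v1/weather": {"city", "units"}}   # assumed toy endpoint schema

    def static_scan(request):
        """SSC stand-in: report detected errors as 'pseudo-facts'."""
        endpoint = request.get("endpoint")
        if endpoint not in KNOWN_API:
            return [f"unknown endpoint {endpoint!r}"]
        return [f"unknown parameter {p!r}"
                for p in request.get("params", {}) if p not in KNOWN_API[endpoint]]

    def generate_with_feedback(llm, task, max_rounds=3):
        feedback, request = "", {}
        for _ in range(max_rounds):
            request = llm(task, feedback)       # model proposes a structured request
            errors = static_scan(request)       # static loop: factual error feedback
            if not errors:
                return request                  # passed static checks
            feedback = "; ".join(errors)        # a DAC would append doc excerpts here
        return request

    def toy_llm(task, feedback):                # corrects itself once it sees feedback
        params = {"city": "Paris"} if feedback else {"town": "Paris"}
        return {"endpoint": "GET /v1/weather", "params": params}

    print(generate_with_feedback(toy_llm, "weather in Paris"))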
arXiv:2410.05363 [pdf, other] cs.CV
Towards World Simulator: Crafting Physical Commonsense-Based Benchmark for Video Generation
Authors: Fanqing Meng, Jiaqi Liao, Xinyu Tan, Wenqi Shao, Quanfeng Lu, Kaipeng Zhang, Yu Cheng, Dianqi Li, Yu Qiao, Ping Luo
Abstract: Text-to-video (T2V) models like Sora have made significant strides in visualizing complex prompts, which is increasingly viewed as a promising path towards constructing the universal world simulator. Cognitive psychologists believe that the foundation for achieving this goal is the ability to understand intuitive physics. However, the capacity of these models to accurately represent intuitive physics remains largely unexplored.
To bridge this gap, we introduce PhyGenBench, a comprehensive \textbf{Phy}sics \textbf{Gen}eration \textbf{Ben}chmark designed to evaluate physical commonsense correctness in T2V generation. PhyGenBench comprises 160 carefully crafted prompts across 27 distinct physical laws, spanning four fundamental domains, which could comprehensively assesses models' understanding of physical commonsense. Alongside PhyGenBench, we propose a novel evaluation framework called PhyGenEval. This framework employs a hierarchical evaluation structure utilizing appropriate advanced vision-language models and large language models to assess physical commonsense. Through PhyGenBench and PhyGenEval, we can conduct large-scale automated assessments of T2V models' understanding of physical commonsense, which align closely with human feedback. Our evaluation results and in-depth analysis demonstrate that current models struggle to generate videos that comply with physical commonsense. Moreover, simply scaling up models or employing prompt engineering techniques is insufficient to fully address the challenges presented by PhyGenBench (e.g., dynamic scenarios). We hope this study will inspire the community to prioritize the learning of physical commonsense in these models beyond entertainment applications. We will release the data and codes at https://github.com/OpenGVLab/PhyGenBench <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05363v1-abstract-full').style.display = 'none'; document.getElementById('2410.05363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://phygenbench123.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04587">arXiv:2410.04587</a> <span> [<a href="https://arxiv.org/pdf/2410.04587">pdf</a>, <a href="https://arxiv.org/format/2410.04587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Hammer: Robust Function-Calling for On-Device Language Models via Function Masking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qiqiang Lin</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+M">Muning Wen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Q">Qiuying Peng</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+G">Guanyu Nie</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Junwei Liao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+X">Xiaoyun Mo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiamu Zhou</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+C">Cheng Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yin Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04587v2-abstract-short" style="display: inline;"> Large language models have demonstrated impressive value in performing as autonomous agents when equipped with external tools and API calls. Nonetheless, effectively harnessing their potential for executing complex tasks crucially relies on enhancements in their function calling capabilities. This paper identifies a critical gap in existing function calling models, where performance varies signifi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04587v2-abstract-full').style.display = 'inline'; document.getElementById('2410.04587v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04587v2-abstract-full" style="display: none;"> Large language models have demonstrated impressive value in performing as autonomous agents when equipped with external tools and API calls. Nonetheless, effectively harnessing their potential for executing complex tasks crucially relies on enhancements in their function calling capabilities. This paper identifies a critical gap in existing function calling models, where performance varies significantly across benchmarks, often due to being misled by specific naming conventions. To address such an issue, we introduce Hammer, a novel family of foundation models specifically engineered for on-device function calling. 

arXiv:2410.03554 (cs.LG, physics.optics)
Artificial intelligence inspired freeform optics design: a review
Authors: Lei Feng, Jingxing Liao, Jingna Yang
Abstract: Integrating artificial intelligence (AI) techniques such as machine learning and deep learning into freeform optics design has significantly enhanced design efficiency, expanded the design space, and led to innovative solutions. This article reviews the latest developments in AI applications within this field, highlighting their roles in initial design generation, optimization, and performance prediction. It also addresses the benefits of AI, such as improved accuracy and performance, alongside challenges like data requirements, model interpretability, and computational complexity. Despite these challenges, the future of AI in freeform optics design looks promising, with potential advancements in hybrid design methods, interpretable AI, AI-driven manufacturing, and targeted research for specific applications. Collaboration among researchers, engineers, and designers is essential to fully harness AI's potential and drive innovation in optics.
Submitted 25 October, 2024; v1 submitted 17 September, 2024; originally announced October 2024.
Comments: Realizing that the manuscript requires substantial revisions that cannot be addressed through minor updates

arXiv:2410.02587 [pdf, other] (cs.CV, math.NA)
An Improved Variational Method for Image Denoising
Authors: Jing-En Huang, Jia-Wei Liao, Ku-Te Lin, Yu-Ju Tsai, Mei-Heng Yueh
Abstract: The total variation (TV) method is an image denoising technique that aims to reduce noise by minimizing the total variation of the image, which measures the variation in pixel intensities. The TV method has been widely applied in image processing and computer vision for its ability to preserve edges and enhance image quality. In this paper, we propose an improved TV model for image denoising, together with the associated numerical algorithm to carry out the procedure, which is particularly effective in removing several types of noise and their combinations. Our improved model admits a unique solution, and the associated numerical algorithm guarantees convergence. Numerical experiments demonstrate improved effectiveness and denoising quality compared to other TV models. These encouraging results further enhance the utility of the TV method in image processing.
Submitted 3 October, 2024; originally announced October 2024.
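
For readers unfamiliar with the baseline being improved on, a minimal smoothed-TV (ROF-style) denoiser by gradient descent looks roughly like this; step size, smoothing constant, and iteration count are illustrative, and this is the classical model, not the paper's improved one:

    import numpy as np

    def tv_denoise(f, lam=0.1, tau=0.05, eps=1e-6, iters=300):
        # Gradient descent on 0.5*||u - f||^2 + lam * sum sqrt(|grad u|^2 + eps),
        # with periodic boundaries via np.roll.
        u = f.astype(np.float64).copy()
        for _ in range(iters):
            ux = np.roll(u, -1, axis=1) - u          # forward difference in x
            uy = np.roll(u, -1, axis=0) - u          # forward difference in y
            mag = np.sqrt(ux**2 + uy**2 + eps)       # smoothed gradient magnitude
            px, py = ux / mag, uy / mag
            # Divergence of (px, py) via backward differences.
            div = (px - np.roll(px, 1, axis=1)) + (py - np.roll(py, 1, axis=0))
            u -= tau * ((u - f) - lam * div)
        return u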

arXiv:2409.18696 [pdf, other] (cs.LG)
Rethinking the Power of Timestamps for Robust Time Series Forecasting: A Global-Local Fusion Perspective
Authors: Chengsen Wang, Qi Qi, Jingyu Wang, Haifeng Sun, Zirui Zhuang, Jinming Wu, Jianxin Liao
Abstract: Time series forecasting has played a pivotal role across various industries, including finance, transportation, energy, healthcare, and climate. Because of the abundant seasonal information they contain, timestamps have the potential to offer robust global guidance for forecasting techniques. However, existing works primarily focus on local observations, and timestamps are treated merely as an optional supplement that remains underutilized. When data gathered from the real world is polluted, the absence of global information damages the robust prediction capability of these algorithms. To address these problems, we propose a novel framework named GLAFF. Within this framework, timestamps are modeled individually to capture global dependencies. Working as a plugin, GLAFF adaptively adjusts the combined weights for global and local information, enabling seamless collaboration with any time series forecasting backbone. Extensive experiments conducted on nine real-world datasets demonstrate that GLAFF significantly enhances the average performance of widely used mainstream forecasting models by 12.5%, surpassing the previous state-of-the-art method by 5.5%.
Submitted 20 November, 2024; v1 submitted 27 September, 2024; originally announced September 2024.
Comments: Accepted by NeurIPS 2024
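
The plugin idea, a timestamp-only global forecast adaptively weighted against any backbone's local forecast, can be sketched as follows; the module name, layer sizes, and gating form are assumptions for illustration, not GLAFF's actual architecture:

    import torch
    import torch.nn as nn

    class TimestampFusion(nn.Module):
        """Hypothetical sketch: gate a global (timestamp-driven) forecast
        against a local forecast from an arbitrary backbone."""
        def __init__(self, ts_dim, horizon):
            super().__init__()
            self.global_head = nn.Sequential(
                nn.Linear(ts_dim, 64), nn.ReLU(), nn.Linear(64, horizon))
            self.gate = nn.Sequential(
                nn.Linear(2 * horizon, horizon), nn.Sigmoid())

        def forward(self, ts_feats, local_pred):
            g = self.global_head(ts_feats)                     # global view
            w = self.gate(torch.cat([g, local_pred], dim=-1))  # adaptive weights
            return w * g + (1 - w) * local_pred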

arXiv:2409.18170 [pdf, other] (cs.CL, cs.AI)
Evaluation of Large Language Models for Summarization Tasks in the Medical Domain: A Narrative Review
Authors: Emma Croxford, Yanjun Gao, Nicholas Pellegrino, Karen K. Wong, Graham Wills, Elliot First, Frank J. Liao, Cherodeep Goswami, Brian Patterson, Majid Afshar
Abstract: Large Language Models have advanced clinical Natural Language Generation, creating opportunities to manage the volume of medical text. However, the high-stakes nature of medicine requires reliable evaluation, which remains a challenge. In this narrative review, we assess the current evaluation state for clinical summarization tasks and propose future directions to address the resource constraints of expert human evaluation.
Submitted 26 September, 2024; originally announced September 2024.

arXiv:2409.16938 [pdf, other] (cs.CV, cs.AI, cs.GR)
Generative Object Insertion in Gaussian Splatting with a Multi-View Diffusion Model
Authors: Hongliang Zhong, Can Wang, Jingbo Zhang, Jing Liao
Abstract: Generating and inserting new objects into 3D content is a compelling approach for achieving versatile scene recreation. Existing methods, which rely on SDS optimization or single-view inpainting, often struggle to produce high-quality results. To address this, we propose a novel method for object insertion in 3D content represented by Gaussian Splatting. Our approach introduces a multi-view diffusion model, dubbed MVInpainter, which is built upon a pre-trained stable video diffusion model to facilitate view-consistent object inpainting. Within MVInpainter, we incorporate a ControlNet-based conditional injection module to enable controlled and more predictable multi-view generation. After generating the multi-view inpainted results, we further propose a mask-aware 3D reconstruction technique to refine the Gaussian Splatting reconstruction from these sparse inpainted views. By combining these techniques, our approach yields diverse results, ensures view-consistent and harmonious insertions, and produces better object quality. Extensive experiments demonstrate that our approach outperforms existing methods.
Submitted 25 September, 2024; originally announced September 2024.
Comments: Project Page: https://github.com/JiuTongBro/MultiView_Inpaint
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16938v1-abstract-full').style.display = 'none'; document.getElementById('2409.16938v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://github.com/JiuTongBro/MultiView_Inpaint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12960">arXiv:2409.12960</a> <span> [<a href="https://arxiv.org/pdf/2409.12960">pdf</a>, <a href="https://arxiv.org/format/2409.12960">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3687910">10.1145/3687910 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LVCD: Reference-based Lineart Video Colorization with Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhitong Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mohan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jing Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12960v1-abstract-short" style="display: inline;"> We propose the first video diffusion framework for reference-based lineart video colorization. Unlike previous works that rely solely on image generative models to colorize lineart frame by frame, our approach leverages a large-scale pretrained video diffusion model to generate colorized animation videos. This approach leads to more temporally consistent results and is better equipped to handle la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12960v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12960v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12960v1-abstract-full" style="display: none;"> We propose the first video diffusion framework for reference-based lineart video colorization. Unlike previous works that rely solely on image generative models to colorize lineart frame by frame, our approach leverages a large-scale pretrained video diffusion model to generate colorized animation videos. This approach leads to more temporally consistent results and is better equipped to handle large motions. 

arXiv:2409.08481 [pdf, other] (eess.IV, cs.CV)
USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in 2020s
Authors: Zhuoyuan Li, Junqi Liao, Chuanbo Tang, Haotian Zhang, Yuqi Li, Yifan Bian, Xihua Sheng, Xinmin Feng, Yao Li, Changsheng Gao, Li Li, Dong Liu, Feng Wu
Abstract: Image/video coding has been a remarkable research area for both academia and industry for many years. Test datasets, especially high-quality image/video datasets, are desirable for the justified evaluation of coding-related research, practical applications, and standardization activities. We put forward a test dataset named USTC-TD, which has been successfully adopted in the practical end-to-end image/video coding challenge of the IEEE International Conference on Visual Communications and Image Processing (VCIP) in 2022 and 2023. USTC-TD contains 40 images at 4K spatial resolution and 10 video sequences at 1080p spatial resolution, featuring varied content arising from diverse environmental factors (e.g., scene type, texture, motion, view) and designed imaging factors (e.g., illumination, lens, shadow). We quantitatively evaluate USTC-TD on different image/video features (spatial, temporal, color, lightness) and compare it with previous image/video test datasets, which verifies the wider coverage and greater diversity of the proposed dataset. We also evaluate both classic standardized and recent learned image/video coding schemes on USTC-TD with PSNR and MS-SSIM, providing an extensive benchmark for the evaluated schemes. Based on the characteristics and specific design of the proposed test dataset, we analyze the benchmark performance and shed light on future research and development in image/video coding. All the data are released online: https://esakak.github.io/USTC-TD
Submitted 14 November, 2024; v1 submitted 12 September, 2024; originally announced September 2024.
Comments: 23 pages. Project Page: https://esakak.github.io/USTC-TD

arXiv:2409.06355 [pdf, other] (cs.CV)
DiffQRCoder: Diffusion-based Aesthetic QR Code Generation with Scanning Robustness Guided Iterative Refinement
Authors: Jia-Wei Liao, Winston Wang, Tzu-Sian Wang, Li-Xuan Peng, Ju-Hsuan Weng, Cheng-Fu Chou, Jun-Cheng Chen
Abstract: With the success of diffusion models for image generation, these technologies have also revolutionized aesthetic Quick Response (QR) code generation. Despite significant improvements in the visual attractiveness of beautified codes, their scannability is usually sacrificed, hindering their practical use in real-world scenarios. To address this issue, we propose a novel training-free Diffusion-based QR Code generator (DiffQRCoder) that effectively crafts both scannable and visually pleasing QR codes. The proposed approach introduces Scanning-Robust Perceptual Guidance (SRPG), a new diffusion guidance that constrains the generated aesthetic codes to conform to the ground-truth QR codes while maintaining their attractiveness during the denoising process. Additionally, we present a post-processing technique, Scanning-Robust Manifold Projected Gradient Descent (SR-MPGD), to further enhance scanning robustness through iterative latent-space optimization. Extensive experiments demonstrate that our approach not only outperforms other compared methods in Scanning Success Rate (SSR) with a better or comparable CLIP aesthetic score (CLIP-aes.) but also significantly improves the SSR of the ControlNet-only approach from 60% to 99%. Subjective evaluation indicates that our approach achieves promising visual attractiveness as well. Finally, even with different scanning angles and the most rigorous error-tolerance settings, our approach robustly achieves over 95% SSR, demonstrating its capability for real-world applications. Our project page is available at https://jwliao1209.github.io/DiffQRCoder.
Submitted 15 February, 2025; v1 submitted 10 September, 2024; originally announced September 2024.
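
SR-MPGD is described as iterative latent-space optimization; a generic projected-gradient sketch of that idea follows. The loss callable, step size, and norm-ball projection are assumptions for illustration, and the paper's actual objective and projection may differ:

    import torch

    def latent_pgd(z_init, scan_loss, steps=50, lr=0.05, radius=1.0):
        # Lower a differentiable scanning-robustness loss (caller-supplied
        # callable returning a scalar) while keeping the latent within a
        # ball around its initial value.
        z0 = z_init.detach().clone()
        z = z0.clone().requires_grad_(True)
        for _ in range(steps):
            loss = scan_loss(z)
            loss.backward()
            with torch.no_grad():
                z -= lr * z.grad
                delta = z - z0
                norm = delta.norm()
                if norm > radius:              # project back onto the ball
                    z.copy_(z0 + delta * (radius / norm))
            z.grad = None
        return z.detach()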

arXiv:2409.05425 [pdf, other] (cs.CV)
Distribution Discrepancy and Feature Heterogeneity for Active 3D Object Detection
Authors: Huang-Yu Chen, Jia-Fong Yeh, Jia-Wei Liao, Pin-Hsuan Peng, Winston H. Hsu
Abstract: LiDAR-based 3D object detection is a critical technology for the development of autonomous driving and robotics. However, the high cost of data annotation limits its advancement. We propose a novel and effective active learning (AL) method called Distribution Discrepancy and Feature Heterogeneity (DDFH), which simultaneously considers geometric features and model embeddings, assessing information from both the instance-level and frame-level perspectives. Distribution Discrepancy evaluates the difference and novelty of instances within the unlabeled and labeled distributions, enabling the model to learn efficiently with limited data. Feature Heterogeneity ensures the heterogeneity of intra-frame instance features, maintaining feature diversity while avoiding redundant or similar instances, thus minimizing annotation costs. Finally, multiple indicators are efficiently aggregated using Quantile Transform, providing a unified measure of informativeness. Extensive experiments demonstrate that DDFH outperforms the current state-of-the-art (SOTA) methods on the KITTI and Waymo datasets, effectively reducing the bounding-box annotation cost by 56.3% and showing robustness when working with both one-stage and two-stage models.
Submitted 11 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
Comments: Accepted to CoRL 2024
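
Aggregating heterogeneous indicators with a quantile transform, as this abstract describes, amounts to mapping each indicator onto a common uniform scale before combining; a minimal version using scikit-learn, with an assumed equal-weight mean as the combination rule:

    import numpy as np
    from sklearn.preprocessing import QuantileTransformer

    def aggregate_informativeness(indicators):
        # indicators: (n_candidates, n_indicators) raw scores on
        # incompatible scales.
        qt = QuantileTransformer(output_distribution="uniform",
                                 n_quantiles=min(1000, indicators.shape[0]))
        uniform = qt.fit_transform(indicators)   # each column mapped to [0, 1]
        return uniform.mean(axis=1)              # unified informativeness score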

arXiv:2408.12615 [pdf, other] (eess.IV, cs.CV, cs.LG)
Pediatric TSC-Related Epilepsy Classification from Clinical MR Images Using Quantum Neural Network
Authors: Ling Lin, Yihang Zhou, Zhanqi Hu, Dian Jiang, Congcong Liu, Shuo Zhou, Yanjie Zhu, Jianxiang Liao, Dong Liang, Hairong Zheng, Haifeng Wang
Abstract: Tuberous sclerosis complex (TSC) manifests as a multisystem disorder with significant neurological implications. This study addresses the critical need for robust classification models tailored to TSC in pediatric patients, introducing QResNet, a novel deep learning model that seamlessly integrates conventional convolutional neural networks with quantum neural networks. The model incorporates a two-layer quantum layer (QL), comprising ZZFeatureMap and Ansatz layers, strategically designed for processing classical data within a quantum framework. A comprehensive evaluation demonstrates the superior performance of QResNet in TSC MRI image classification compared to conventional 3D-ResNet models. These compelling findings underscore the potential of quantum computing to revolutionize medical imaging and diagnostics. Remarkably, this method surpasses conventional CNNs in accuracy and Area Under the Curve (AUC) metrics on the current dataset. Future research may focus on exploring the scalability and practical implementation of quantum algorithms in real-world medical imaging scenarios.
Submitted 26 August, 2024; v1 submitted 8 August, 2024; originally announced August 2024.
Comments: 5 pages, 4 figures, 2 tables; presented at ISBI 2024
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages,4 figures,2 tables,presented at ISBI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11810">arXiv:2408.11810</a> <span> [<a href="https://arxiv.org/pdf/2408.11810">pdf</a>, <a href="https://arxiv.org/format/2408.11810">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pixel Is Not a Barrier: An Effective Evasion Attack for Pixel-Domain Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shih%2C+C">Chun-Yen Shih</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+L">Li-Xuan Peng</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jia-Wei Liao</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+E">Ernie Chu</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+C">Cheng-Fu Chou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun-Cheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11810v3-abstract-short" style="display: inline;"> Diffusion Models have emerged as powerful generative models for high-quality image synthesis, with many subsequent image editing techniques based on them. However, the ease of text-based image editing introduces significant risks, such as malicious editing for scams or intellectual property infringement. Previous works have attempted to safeguard images from diffusion-based editing by adding imper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11810v3-abstract-full').style.display = 'inline'; document.getElementById('2408.11810v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11810v3-abstract-full" style="display: none;"> Diffusion Models have emerged as powerful generative models for high-quality image synthesis, with many subsequent image editing techniques based on them. However, the ease of text-based image editing introduces significant risks, such as malicious editing for scams or intellectual property infringement. Previous works have attempted to safeguard images from diffusion-based editing by adding imperceptible perturbations. These methods are costly and specifically target prevalent Latent Diffusion Models (LDMs), while Pixel-domain Diffusion Models (PDMs) remain largely unexplored and robust against such attacks. Our work addresses this gap by proposing a novel attack framework, AtkPDM. AtkPDM is mainly composed of a feature representation attacking loss that exploits vulnerabilities in denoising UNets and a latent optimization strategy to enhance the naturalness of adversarial images. Extensive experiments demonstrate the effectiveness of our approach in attacking dominant PDM-based editing methods (e.g., SDEdit) while maintaining reasonable fidelity and robustness against common defense methods. Additionally, our framework is extensible to LDMs, achieving comparable performance to existing approaches. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11810v3-abstract-full').style.display = 'none'; document.getElementById('2408.11810v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11278">arXiv:2408.11278</a> <span> [<a href="https://arxiv.org/pdf/2408.11278">pdf</a>, <a href="https://arxiv.org/format/2408.11278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> The Key of Parameter Skew in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sifan Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Junfeng Liao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Riquan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11278v1-abstract-short" style="display: inline;"> Federated Learning (FL) has emerged as an excellent solution for performing deep learning on different data owners without exchanging raw data. However, statistical heterogeneity in FL presents a key challenge, leading to a phenomenon of skewness in local model parameter distributions that researchers have largely overlooked. In this work, we propose the concept of parameter skew to describe the p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11278v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11278v1-abstract-full" style="display: none;"> Federated Learning (FL) has emerged as an excellent solution for performing deep learning on different data owners without exchanging raw data. However, statistical heterogeneity in FL presents a key challenge, leading to a phenomenon of skewness in local model parameter distributions that researchers have largely overlooked. In this work, we propose the concept of parameter skew to describe the phenomenon that can substantially affect the accuracy of global model parameter estimation. Additionally, we introduce FedSA, an aggregation strategy to obtain a high-quality global model, to address the implication from parameter skew. Specifically, we categorize parameters into high-dispersion and low-dispersion groups based on the coefficient of variation. For high-dispersion parameters, Micro-Classes (MIC) and Macro-Classes (MAC) represent the dispersion at the micro and macro levels, respectively, forming the foundation of FedSA. To evaluate the effectiveness of FedSA, we conduct extensive experiments with different FL algorithms on three computer vision datasets. 
FedSA outperforms eight state-of-the-art baselines by about 4.7% in test accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11278v1-abstract-full').style.display = 'none'; document.getElementById('2408.11278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10136">arXiv:2408.10136</a> <span> [<a href="https://arxiv.org/pdf/2408.10136">pdf</a>, <a href="https://arxiv.org/format/2408.10136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Robust spectral clustering with rank statistics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cape%2C+J">Joshua Cape</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xianshi Yu</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J+Z">Jonquil Z. Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10136v2-abstract-short" style="display: inline;"> This paper analyzes the statistical performance of a robust spectral clustering method for latent structure recovery in noisy data matrices. We consider eigenvector-based clustering applied to a matrix of nonparametric rank statistics that is derived entrywise from the raw, original data matrix. This approach is robust in the sense that, unlike traditional spectral clustering procedures, it can pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10136v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10136v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10136v2-abstract-full" style="display: none;"> This paper analyzes the statistical performance of a robust spectral clustering method for latent structure recovery in noisy data matrices. We consider eigenvector-based clustering applied to a matrix of nonparametric rank statistics that is derived entrywise from the raw, original data matrix. This approach is robust in the sense that, unlike traditional spectral clustering procedures, it can provably recover population-level latent block structure even when the observed data matrix includes heavy-tailed entries and has a heterogeneous variance profile. Our main theoretical contributions are threefold and hold under flexible data generating conditions. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10136">arXiv:2408.10136</a> <span> [<a href="https://arxiv.org/pdf/2408.10136">pdf</a>, <a href="https://arxiv.org/format/2408.10136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Robust spectral clustering with rank statistics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cape%2C+J">Joshua Cape</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xianshi Yu</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J+Z">Jonquil Z. Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.10136v2-abstract-full"> This paper analyzes the statistical performance of a robust spectral clustering method for latent structure recovery in noisy data matrices. We consider eigenvector-based clustering applied to a matrix of nonparametric rank statistics that is derived entrywise from the raw, original data matrix. This approach is robust in the sense that, unlike traditional spectral clustering procedures, it can provably recover population-level latent block structure even when the observed data matrix includes heavy-tailed entries and has a heterogeneous variance profile. Our main theoretical contributions are threefold and hold under flexible data generating conditions. First, we establish that robust spectral clustering with rank statistics can consistently recover latent block structure, viewed as communities of nodes in a graph, in the sense that unobserved community memberships for all but a vanishing fraction of nodes are correctly recovered with high probability when the data matrix is large. Second, we refine the former result and further establish that, under certain conditions, the community membership of any individual, specified node of interest can be asymptotically exactly recovered with probability tending to one in the large-data limit. Third, we establish asymptotic normality results associated with the truncated eigenstructure of matrices whose entries are rank statistics, made possible by synthesizing contemporary entrywise matrix perturbation analysis with the classical nonparametric theory of so-called simple linear rank statistics. Collectively, these results demonstrate the statistical utility of rank-based data transformations when paired with spectral techniques for dimensionality reduction. Additionally, for a dataset of human connectomes, our approach yields parsimonious dimensionality reduction and improved recovery of ground-truth neuroanatomical cluster structure. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
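<p class="is-size-7">Editorial sketch (not from the paper): the basic pipeline, entrywise rank transform followed by eigenvector clustering, can be prototyped as below, assuming scipy and scikit-learn. The paper's estimator and its theory are considerably richer; this only shows the mechanical idea.</p> <pre><code class="language-python">
# Minimal sketch of eigenvector clustering on entrywise rank statistics.
import numpy as np
from scipy.stats import rankdata
from sklearn.cluster import KMeans

def robust_spectral_clusters(A, k):
    """A: (n, n) symmetric data matrix, possibly heavy-tailed; k: number of blocks."""
    R = rankdata(A, method="average").reshape(A.shape)  # entrywise ranks of all n*n entries
    R = (R - R.mean()) / R.std()                        # center and scale the rank matrix
    vals, vecs = np.linalg.eigh(R)
    idx = np.argsort(np.abs(vals))[::-1][:k]            # top-k eigenvectors by |eigenvalue|
    X = vecs[:, idx]
    return KMeans(n_clusters=k, n_init=10).fit_predict(X)
</code></pre> <p class="is-size-7">Because only the ranks of the entries enter the spectral step, a handful of extreme heavy-tailed values cannot dominate the leading eigenvectors, which is the intuition behind the robustness claim.</p>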
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">81 pages, 8 figures, 1 table</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62H12; 62H30; 62G35 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08902">arXiv:2408.08902</a> <span> [<a href="https://arxiv.org/pdf/2408.08902">pdf</a>, <a href="https://arxiv.org/format/2408.08902">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Audit-LLM: Multi-Agent Collaboration for Log-based Insider Threat Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+C">Chengyu Song</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Linru Ma</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jianming Zheng</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jinzhi Liao</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+H">Hongyu Kuang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08902v1-abstract-short" style="display: inline;"> Log-based insider threat detection (ITD) detects malicious user activities by auditing log entries. Recently, large language models (LLMs) with strong common sense knowledge have emerged in the domain of ITD. Nevertheless, diverse activity types and overlong log files pose a significant challenge for LLMs in directly discerning malicious ones within myriads of normal activities. Furthermore, the f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08902v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08902v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08902v1-abstract-full" style="display: none;"> Log-based insider threat detection (ITD) detects malicious user activities by auditing log entries. Recently, large language models (LLMs) with strong common sense knowledge have emerged in the domain of ITD. Nevertheless, diverse activity types and overlong log files pose a significant challenge for LLMs in directly discerning malicious ones within myriads of normal activities. Furthermore, the faithfulness hallucination issue from LLMs aggravates its application difficulty in ITD, as the generated conclusion may not align with user commands and activity context. In response to these challenges, we introduce Audit-LLM, a multi-agent log-based insider threat detection framework comprising three collaborative agents: (i) the Decomposer agent, breaking down the complex ITD task into manageable sub-tasks using Chain-of-Thought (COT) reasoning;(ii) the Tool Builder agent, creating reusable tools for sub-tasks to overcome context length limitations in LLMs; and (iii) the Executor agent, generating the final detection conclusion by invoking constructed tools. 
To enhance conclusion accuracy, we propose a pair-wise Evidence-based Multi-agent Debate (EMAD) mechanism, where two independent Executors iteratively refine their conclusions through reasoning exchange to reach a consensus. Comprehensive experiments conducted on three publicly available ITD datasets (CERT r4.2, CERT r5.2, and PicoDomain) demonstrate the superiority of our method over existing baselines and show that the proposed EMAD significantly improves the faithfulness of explanations generated by LLMs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p>
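<p class="is-size-7">Editorial sketch (not from the paper): the debate-to-consensus loop described for EMAD can be outlined as below. <code>query_llm</code> is a hypothetical helper, and the prompts, stopping rule, and tie-breaking are assumptions rather than the authors' design.</p> <pre><code class="language-python">
# Hedged sketch of a pairwise evidence-based debate loop in the spirit of EMAD.
def debate(evidence, query_llm, max_rounds=3):
    """query_llm(prompt, evidence, peer=None) is a hypothetical LLM call."""
    a = query_llm("Judge maliciousness from this evidence:", evidence)
    b = query_llm("Judge maliciousness from this evidence:", evidence)
    for _ in range(max_rounds):
        if a == b:                                   # consensus reached
            return a
        # each executor reconsiders after seeing the peer's conclusion and reasoning
        a = query_llm("Reconsider given the peer's view:", evidence, peer=b)
        b = query_llm("Reconsider given the peer's view:", evidence, peer=a)
    return (a, b)   # persistent disagreement surfaced for review (assumption)
</code></pre>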
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08056">arXiv:2408.08056</a> <span> [<a href="https://arxiv.org/pdf/2408.08056">pdf</a>, <a href="https://arxiv.org/format/2408.08056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DATTA: Towards Diversity Adaptive Test-Time Adaptation in Dynamic Wild World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+C">Chuyang Ye</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+D">Dongyan Wei</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhendong Liu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yuanyi Pang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yixi Lin</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jiarong Liao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qinting Jiang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xianghua Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jingyan Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.08056v1-abstract-full"> Test-time adaptation (TTA) effectively addresses distribution shifts between training and testing data by adjusting models on test samples, which is crucial for improving model inference in real-world applications. However, traditional TTA methods typically follow a fixed pattern to address dynamic data patterns (low-diversity or high-diversity patterns), often leading to performance degradation and consequently a decline in Quality of Experience (QoE). The primary issues we observed are: (i) different scenarios require different normalization methods (e.g., Instance Normalization is optimal in mixed domains but not in static domains), and (ii) model fine-tuning can potentially harm the model and waste time. Hence, it is crucial to design strategies for effectively measuring and managing distribution diversity to minimize its negative impact on model performance. Based on these observations, this paper proposes a new general method, named Diversity Adaptive Test-Time Adaptation (DATTA), aimed at improving QoE. DATTA dynamically selects the best batch normalization methods and fine-tuning strategies by leveraging the Diversity Score to differentiate between high and low diversity score batches. It features three key components: Diversity Discrimination (DD) to assess batch diversity, Diversity Adaptive Batch Normalization (DABN) to tailor normalization methods based on DD insights, and Diversity Adaptive Fine-Tuning (DAFT) to selectively fine-tune the model. Experimental results show that our method achieves up to a 21% increase in accuracy compared to state-of-the-art methodologies, indicating that our method maintains good model performance while demonstrating its robustness. Our code will be released soon. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
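<p class="is-size-7">Editorial sketch (not from the paper): a diversity-score gate that switches between batch statistics and the source model's running statistics, loosely after the DD/DABN idea. The score definition and threshold below are assumptions, not the paper's formulas; PyTorch is assumed.</p> <pre><code class="language-python">
# Hedged sketch of diversity-gated normalization for test-time adaptation.
import torch

def diversity_score(feats):
    """feats: (batch, channels); larger means a more heterogeneous batch."""
    centered = feats - feats.mean(dim=0, keepdim=True)
    return centered.norm(dim=1).mean().item()

def normalize_for_tta(feats, running_mean, running_var, threshold=5.0, eps=1e-5):
    if diversity_score(feats) > threshold:
        # high-diversity batch: trust the statistics of the current batch
        mean, var = feats.mean(dim=0), feats.var(dim=0, unbiased=False)
    else:
        # low-diversity batch: keep the source model's running statistics
        mean, var = running_mean, running_var
    return (feats - mean) / torch.sqrt(var + eps)
</code></pre>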
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02718">arXiv:2408.02718</a> <span> [<a href="https://arxiv.org/pdf/2408.02718">pdf</a>, <a href="https://arxiv.org/format/2408.02718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanqing Meng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanhao Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Q">Quanfeng Lu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jiaqi Liao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xizhou Zhu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jifeng Dai</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02718v1-abstract-short" style="display: inline;"> The capability to process multiple images is crucial for Large Vision-Language Models (LVLMs) to develop a more thorough and nuanced understanding of a scene. Recent multi-image LVLMs have begun to address this need. However, their evaluation has not kept pace with their development. To fill this gap, we introduce the Multimodal Multi-image Understanding (MMIU) benchmark, a comprehensive evaluatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02718v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02718v1-abstract-full" style="display: none;"> The capability to process multiple images is crucial for Large Vision-Language Models (LVLMs) to develop a more thorough and nuanced understanding of a scene. Recent multi-image LVLMs have begun to address this need. However, their evaluation has not kept pace with their development. To fill this gap, we introduce the Multimodal Multi-image Understanding (MMIU) benchmark, a comprehensive evaluation suite designed to assess LVLMs across a wide range of multi-image tasks. MMIU encompasses 7 types of multi-image relationships, 52 tasks, 77K images, and 11K meticulously curated multiple-choice questions, making it the most extensive benchmark of its kind. Our evaluation of 24 popular LVLMs, including both open-source and proprietary models, reveals significant challenges in multi-image comprehension, particularly in tasks involving spatial understanding. 
Even the most advanced models, such as GPT-4o, achieve only 55.7% accuracy on MMIU. Through multi-faceted analytical experiments, we identify key performance gaps and limitations, providing valuable insights for future model and data improvements. We aim for MMIU to advance the frontier of LVLM research and development, moving us toward achieving sophisticated multimodal multi-image user interactions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://mmiu-bench.github.io/</span> </p>
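<p class="is-size-7">Editorial sketch (not from the paper): scoring a multiple-choice benchmark such as MMIU reduces to comparing predicted option letters against an answer key. The field names below are assumptions; consult the project page for the actual data format.</p> <pre><code class="language-python">
# Hedged sketch of multiple-choice accuracy scoring for a multi-image benchmark.
def mcq_accuracy(examples, predict):
    """examples: dicts with 'images', 'question', 'options', 'answer' (e.g. 'B');
    predict: fn(images, question, options) returning an option letter."""
    correct = sum(
        predict(ex["images"], ex["question"], ex["options"]) == ex["answer"]
        for ex in examples
    )
    return correct / max(len(examples), 1)
</code></pre>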
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00305">arXiv:2408.00305</a> <span> [<a href="https://arxiv.org/pdf/2408.00305">pdf</a>, <a href="https://arxiv.org/format/2408.00305">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1145/3664647.3681677">10.1145/3664647.3681677</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Leveraging Weak Cross-Modal Guidance for Coherence Modelling via Iterative Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bin%2C+Y">Yi Bin</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Junrong Liao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yujuan Ding</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoxuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+S">See-Kiong Ng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H+T">Heng Tao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.00305v1-abstract-full"> Cross-modal coherence modeling is essential for intelligent systems to help them organize and structure information, thereby understanding and creating content of the physical world coherently, like human beings. Previous work on cross-modal coherence modeling attempted to leverage the order information from another modality to assist the coherence recovery of the target modality. Despite its effectiveness, labeled coherency information is not always available and might be costly to acquire, making the cross-modal guidance hard to leverage. To tackle this challenge, this paper explores a new way to take advantage of cross-modal guidance without gold labels on coherency, and proposes the Weak Cross-Modal Guided Ordering (WeGO) model. More specifically, it leverages high-confidence predicted pairwise order in one modality as reference information to guide the coherence modeling in another. An iterative learning paradigm is further designed to jointly optimize the coherence modeling in two modalities with selected guidance from each other. The iterative cross-modal boosting also functions at inference time to further enhance coherence prediction in each modality. Experimental results on two public datasets have demonstrated that the proposed method outperforms existing methods for cross-modal coherence modeling tasks. Ablation studies confirm the effectiveness of the major technical modules. Codes are available at: \url{https://github.com/scvready123/IterWeGO}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
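<p class="is-size-7">Editorial sketch (not from the paper): the iterative exchange of high-confidence pairwise orders between two modality-specific models can be outlined as below. The model interface (<code>order</code>, <code>confidence</code>, <code>fit_pseudo</code>) and the confidence threshold are assumptions for illustration; see the released code for the real training loop.</p> <pre><code class="language-python">
# Hedged sketch of weak cross-modal guidance via confident pairwise pseudo-labels.
def iterate_wego(text_model, image_model, pairs, rounds=3, conf=0.9):
    for _ in range(rounds):
        # each model labels only the pairs it is confident about,
        # providing pseudo-supervision for the other modality
        text_pseudo = [(p, text_model.order(p)) for p in pairs
                       if text_model.confidence(p) > conf]
        image_pseudo = [(p, image_model.order(p)) for p in pairs
                        if image_model.confidence(p) > conf]
        image_model.fit_pseudo(text_pseudo)   # guide image ordering with text predictions
        text_model.fit_pseudo(image_pseudo)   # and vice versa
    return text_model, image_model
</code></pre>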
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Multimedia 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21705">arXiv:2407.21705</a> <span> [<a href="https://arxiv.org/pdf/2407.21705">pdf</a>, <a href="https://arxiv.org/format/2407.21705">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tora: Trajectory-oriented Diffusion Transformer for Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Junchao Liao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Menghao Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zuozhuo Dai</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+B">Bingxue Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Siyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+L">Long Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weizhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21705v3-abstract-short" style="display: inline;"> Recent advancements in Diffusion Transformer (DiT) have demonstrated remarkable proficiency in producing high-quality video content. Nonetheless, the potential of transformer-based diffusion models for effectively generating videos with controllable motion remains an area of limited exploration. This paper introduces Tora, the first trajectory-oriented DiT framework that concurrently integrates te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21705v3-abstract-full').style.display = 'inline'; document.getElementById('2407.21705v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21705v3-abstract-full" style="display: none;"> Recent advancements in Diffusion Transformer (DiT) have demonstrated remarkable proficiency in producing high-quality video content. Nonetheless, the potential of transformer-based diffusion models for effectively generating videos with controllable motion remains an area of limited exploration. This paper introduces Tora, the first trajectory-oriented DiT framework that concurrently integrates textual, visual, and trajectory conditions, thereby enabling scalable video generation with effective motion guidance. Specifically, Tora consists of a Trajectory Extractor(TE), a Spatial-Temporal DiT, and a Motion-guidance Fuser(MGF). The TE encodes arbitrary trajectories into hierarchical spacetime motion patches with a 3D video compression network. The MGF integrates the motion patches into the DiT blocks to generate consistent videos that accurately follow designated trajectories. Our design aligns seamlessly with DiT's scalability, allowing precise control of video content's dynamics with diverse durations, aspect ratios, and resolutions. 
Extensive experiments demonstrate Tora's excellence in achieving high motion fidelity, while also meticulously simulating the intricate movement of the physical world. Code is available at: https://github.com/alibaba/Tora. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21333">arXiv:2407.21333</a> <span> [<a href="https://arxiv.org/pdf/2407.21333">pdf</a>, <a href="https://arxiv.org/format/2407.21333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Chat2Layout: Interactive 3D Furniture Layout with a Multimodal LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+H">Hongliang Zhong</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+M">Menglei Chai</a>, <a href="/search/cs?searchtype=author&query=He%2C+M">Mingming He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jing Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.21333v1-abstract-full"> Automatic furniture layout has long been desired for convenient interior design. Leveraging the remarkable visual reasoning capabilities of multimodal large language models (MLLMs), recent methods address layout generation in a static manner, lacking the feedback-driven refinement essential for interactive user engagement. We introduce Chat2Layout, a novel interactive furniture layout generation system that extends the functionality of MLLMs into the realm of interactive layout design. To achieve this, we establish a unified vision-question paradigm for in-context learning, enabling seamless communication with MLLMs to steer their behavior without altering model weights.
Within this framework, we present a novel training-free visual prompting mechanism. This involves a visual-text prompting technique that assists MLLMs in reasoning about plausible layout plans, followed by an Offline-to-Online search (O2O-Search) method, which automatically identifies the minimal set of informative references to provide exemplars for visual-text prompting. By employing an agent system with MLLMs as the core controller, we enable bidirectional interaction. The agent not only comprehends the 3D environment and user requirements through linguistic and visual perception but also plans tasks and reasons about actions to generate and arrange furniture within the virtual space. Furthermore, the agent iteratively updates based on visual feedback from execution results. Experimental results demonstrate that our approach facilitates language-interactive generation and arrangement for diverse and complex 3D furniture. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Main paper with supplemental materials</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07871">arXiv:2407.07871</a> <span> [<a href="https://arxiv.org/pdf/2407.07871">pdf</a>, <a href="https://arxiv.org/format/2407.07871">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing HNSW Index for Real-Time Updates: Addressing Unreachable Points and Performance Degradation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wentao Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yueyang Zhan</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+R">Rui Xi</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+M">Mengshu Hou</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jianming Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2407.07871v2-abstract-full"> The approximate nearest neighbor search (ANNS) is a fundamental and essential component in data mining and information retrieval, with graph-based methodologies demonstrating superior performance compared to alternative approaches. Extensive research efforts have been dedicated to improving search efficiency by developing various graph-based indices, such as HNSW (Hierarchical Navigable Small World). However, the performance of HNSW and most graph-based indices becomes unacceptable when faced with a large number of real-time deletions, insertions, and updates. Furthermore, during update operations, HNSW can result in some data points becoming unreachable, a situation we refer to as the 'unreachable points phenomenon'. This phenomenon could significantly affect the search accuracy of the graph in certain situations. To address these issues, we present efficient measures to overcome the shortcomings of HNSW, specifically addressing poor performance over long periods of delete and update operations and resolving the issues caused by the unreachable points phenomenon. Our proposed MN-RU algorithm effectively improves update efficiency and suppresses the growth rate of unreachable points, ensuring better overall performance and maintaining the integrity of the graph. Our results demonstrate that our methods outperform existing approaches. Furthermore, since our methods are based on HNSW, they can be easily integrated with existing indices widely used in the industrial field, making them practical for future real-world applications. Code is available at \url{https://github.com/xwt1/MN-RU.git}. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
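<p class="is-size-7">Editorial sketch (not from the paper): the "unreachable points phenomenon" can be checked on any graph index by a breadth-first traversal from the search entry point; nodes the traversal never visits can never be reached by a greedy search starting there. The plain adjacency-dict layout below is an assumption, not real HNSW internals.</p> <pre><code class="language-python">
# Hedged sketch: detecting unreachable points in a graph index via BFS.
from collections import deque

def unreachable_points(adj, entry):
    """adj: {node: iterable of neighbor nodes}; entry: search entry point."""
    seen = {entry}
    q = deque([entry])
    while q:
        u = q.popleft()
        for v in adj.get(u, ()):
            if v not in seen:
                seen.add(v)
                q.append(v)
    return set(adj) - seen   # nodes no search from `entry` can ever visit
</code></pre> <p class="is-size-7">Repeated deletions and re-insertions can sever all inbound edges to a point while its data remains stored, which is exactly the situation this check exposes.</p>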
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07410">arXiv:2407.07410</a> <span> [<a href="https://arxiv.org/pdf/2407.07410">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mutual Information calculation on different appearances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jiecheng Liao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junhao Lu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Jeff Ji</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiacheng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.07410v1-abstract-full"> Mutual information has many applications in image alignment and matching, mainly due to its ability to measure the statistical dependence between two images, even if the two images are from different modalities (e.g., CT and MRI). It considers not only the pixel intensities of the images but also the spatial relationships between the pixels. In this project, we apply the mutual information formula to image matching, where image A is the moving object and image B is the target object, and calculate the mutual information between them to evaluate the similarity between the images. For comparison, we also use entropy and information-gain methods to test the dependency of the images. We also investigate the effect of different environments on the mutual information of the same image, demonstrating the results with experiments and plots. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
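<p class="is-size-7">Editorial sketch (not from the paper): the standard joint-histogram estimate of mutual information between two grayscale images. The bin count and normalization are implementation choices, not taken from the paper.</p> <pre><code class="language-python">
# Minimal sketch of mutual information between two images via a joint histogram.
import numpy as np

def mutual_information(img_a, img_b, bins=64):
    """img_a, img_b: 2-D intensity arrays of equal shape."""
    joint, _, _ = np.histogram2d(img_a.ravel(), img_b.ravel(), bins=bins)
    pxy = joint / joint.sum()                   # joint probability table
    px = pxy.sum(axis=1, keepdims=True)         # marginal of image A
    py = pxy.sum(axis=0, keepdims=True)         # marginal of image B
    nz = pxy > 0                                # avoid log(0)
    return float((pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])).sum())
</code></pre> <p class="is-size-7">The value is zero when the two intensity fields are independent and grows as one image becomes more predictable from the other, which is what makes it usable across modalities such as CT and MRI.</p>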
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">demo for the work: elucidator.cn/demo-mi/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07111">arXiv:2407.07111</a> <span> [<a href="https://arxiv.org/pdf/2407.07111">pdf</a>, <a href="https://arxiv.org/format/2407.07111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Model-Based Video Editing: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wenhao Sun</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+R">Rong-Cheng Tu</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jingyi Liao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07111v1-abstract-short" style="display: inline;"> The rapid development of diffusion models (DMs) has significantly advanced image and video applications, making "what you want is what you see" a reality. Among these, video editing has gained substantial attention and seen a swift rise in research activity, necessitating a comprehensive and systematic review of the existing literature. This paper reviews diffusion model-based video editing techni… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07111v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07111v1-abstract-full" style="display: none;"> The rapid development of diffusion models (DMs) has significantly advanced image and video applications, making "what you want is what you see" a reality. Among these, video editing has gained substantial attention and seen a swift rise in research activity, necessitating a comprehensive and systematic review of the existing literature. This paper reviews diffusion model-based video editing techniques, including theoretical foundations and practical applications. We begin by overviewing the mathematical formulation and image domain's key methods. Subsequently, we categorize video editing approaches by the inherent connections of their core technologies, depicting evolutionary trajectory. This paper also dives into novel applications, including point-based editing and pose-guided human video editing. Additionally, we present a comprehensive comparison using our newly introduced V2VBench. Building on the progress achieved to date, the paper concludes with ongoing challenges and potential directions for future research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07111v1-abstract-full').style.display = 'none'; document.getElementById('2407.07111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 12 figures, a project related to this paper can be found at https://github.com/wenhao728/awesome-diffusion-v2v</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04923">arXiv:2407.04923</a> <span> [<a href="https://arxiv.org/pdf/2407.04923">pdf</a>, <a href="https://arxiv.org/format/2407.04923">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OmChat: A Recipe to Train Multimodal Language Models with Strong Long Context and Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tiancheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qianqian Zhang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyusong Lee</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lu Zhang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chunxin Fang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jiajia Liao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kelei Jiang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yibo Ma</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ruochen Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04923v1-abstract-short" style="display: inline;"> We introduce OmChat, a model designed to excel in handling long contexts and video understanding tasks. OmChat's new architecture standardizes how different visual inputs are processed, making it more efficient and adaptable. It uses a dynamic vision encoding process to effectively handle images of various resolutions, capturing fine details across a range of image qualities. OmChat utilizes an ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04923v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04923v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04923v1-abstract-full" style="display: none;"> We introduce OmChat, a model designed to excel in handling long contexts and video understanding tasks. OmChat's new architecture standardizes how different visual inputs are processed, making it more efficient and adaptable. 
It uses a dynamic vision encoding process to effectively handle images of various resolutions, capturing fine details across a range of image qualities. OmChat utilizes an active progressive multimodal pretraining strategy, which gradually increases the model's capacity for long contexts and enhances its overall abilities. By selecting high-quality data during training, OmChat learns from the most relevant and informative data points. With support for a context length of up to 512K, OmChat demonstrates promising performance in tasks involving multiple images and videos, outperforming most open-source models on these benchmarks. Additionally, OmChat proposes a prompting strategy for unifying complex multimodal inputs, including single-image text, multi-image text, and videos, achieving competitive performance on single-image benchmarks. To further evaluate the model's capabilities, we propose a benchmark dataset named Temporal Visual Needle in a Haystack. This dataset assesses OmChat's ability to comprehend temporal visual details within long videos. Our analysis highlights several key factors contributing to OmChat's success: support for any-aspect high image resolution, the active progressive pretraining strategy, and high-quality supervised fine-tuning datasets. This report provides a detailed overview of OmChat's capabilities and the strategies that enhance its performance in visual understanding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Liao%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Liao%2C+J&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div 
class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>