Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–47 of 47 results for author: <span class="mathjax">Gui, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Gui%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Gui, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Gui%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Gui, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06710">arXiv:2502.06710</a> <span> [<a href="https://arxiv.org/pdf/2502.06710">pdf</a>, <a href="https://arxiv.org/format/2502.06710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Musical Representations for Music Performance Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Diao%2C+X">Xingjian Diao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chunhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tingxuan Wu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Ming Cheng</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Z">Zhongyu Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Weiyi Wu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06710v1-abstract-short" style="display: inline;"> Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities in general scenarios, they are incapable of dealing with fundamental problems within the music performances:… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06710v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06710v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06710v1-abstract-full" style="display: none;"> Music performances are representative scenarios for audio-visual modeling. Unlike common scenarios with sparse audio, music performances continuously involve dense audio signals throughout. While existing multimodal learning methods on the audio-video QA demonstrate impressive capabilities in general scenarios, they are incapable of dealing with fundamental problems within the music performances: they underexplore the interaction between the multimodal signals in performance and fail to consider the distinctive characteristics of instruments and music. Therefore, existing methods tend to answer questions regarding musical performances inaccurately. To bridge the above research gaps, (i) given the intricate multimodal interconnectivity inherent to music data, our primary backbone is designed to incorporate multimodal interactions within the context of music; (ii) to enable the model to learn music characteristics, we annotate and release rhythmic and music sources in the current music datasets; (iii) for time-aware audio-visual modeling, we align the model's music predictions with the temporal dimension. Our experiments show state-of-the-art effects on the Music AVQA datasets. Our code is available at https://github.com/xid32/Amuse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06710v1-abstract-full').style.display = 'none'; document.getElementById('2502.06710v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06020">arXiv:2502.06020</a> <span> [<a href="https://arxiv.org/pdf/2502.06020">pdf</a>, <a href="https://arxiv.org/format/2502.06020">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Temporal Working Memory: Query-Guided Segment Refinement for Enhanced Multimodal Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Diao%2C+X">Xingjian Diao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chunhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Weiyi Wu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Z">Zhongyu Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qing%2C+P">Peijun Qing</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Ming Cheng</a>, <a href="/search/cs?searchtype=author&query=Vosoughi%2C+S">Soroush Vosoughi</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06020v1-abstract-short" style="display: inline;"> Multimodal foundation models (MFMs) have demonstrated significant success in tasks such as visual captioning, question answering, and image-text retrieval. However, these models face inherent limitations due to their finite internal capacity, which restricts their ability to process extended temporal sequences, a crucial requirement for comprehensive video and audio analysis. To overcome these cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06020v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06020v1-abstract-full" style="display: none;"> Multimodal foundation models (MFMs) have demonstrated significant success in tasks such as visual captioning, question answering, and image-text retrieval. However, these models face inherent limitations due to their finite internal capacity, which restricts their ability to process extended temporal sequences, a crucial requirement for comprehensive video and audio analysis. To overcome these challenges, we introduce a specialized cognitive module, temporal working memory (TWM), which aims to enhance the temporal modeling capabilities of MFMs. It selectively retains task-relevant information across temporal dimensions, ensuring that critical details are preserved throughout the processing of video and audio content. The TWM uses a query-guided attention approach to focus on the most informative multimodal segments within temporal sequences. By retaining only the most relevant content, TWM optimizes the use of the model's limited capacity, enhancing its temporal modeling ability. This plug-and-play module can be easily integrated into existing MFMs. With our TWM, nine state-of-the-art models exhibit significant performance improvements across tasks such as video captioning, question answering, and video-text retrieval. By enhancing temporal modeling, TWM extends the capability of MFMs to handle complex, time-sensitive data effectively. Our code is available at https://github.com/xid32/NAACL_2025_TWM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06020v1-abstract-full').style.display = 'none'; document.getElementById('2502.06020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13795">arXiv:2501.13795</a> <span> [<a href="https://arxiv.org/pdf/2501.13795">pdf</a>, <a href="https://arxiv.org/format/2501.13795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Training-Free Zero-Shot Temporal Action Detection with Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+C">Chaolei Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsong Wang</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+J">Jidong Kuang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13795v1-abstract-short" style="display: inline;"> Existing zero-shot temporal action detection (ZSTAD) methods predominantly use fully supervised or unsupervised strategies to recognize unseen activities. However, these training-based methods are prone to domain shifts and require high computational costs, which hinder their practical applicability in real-world scenarios. In this paper, unlike previous works, we propose a training-Free Zero-shot… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13795v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13795v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13795v1-abstract-full" style="display: none;"> Existing zero-shot temporal action detection (ZSTAD) methods predominantly use fully supervised or unsupervised strategies to recognize unseen activities. However, these training-based methods are prone to domain shifts and require high computational costs, which hinder their practical applicability in real-world scenarios. In this paper, unlike previous works, we propose a training-Free Zero-shot temporal Action Detection (FreeZAD) method, leveraging existing vision-language (ViL) models to directly classify and localize unseen activities within untrimmed videos without any additional fine-tuning or adaptation. We mitigate the need for explicit temporal modeling and reliance on pseudo-label quality by designing the LOGarithmic decay weighted Outer-Inner-Contrastive Score (LogOIC) and frequency-based Actionness Calibration. Furthermore, we introduce a test-time adaptation (TTA) strategy using Prototype-Centric Sampling (PCS) to expand FreeZAD, enabling ViL models to adapt more effectively for ZSTAD. Extensive experiments on the THUMOS14 and ActivityNet-1.3 datasets demonstrate that our training-free method outperforms state-of-the-art unsupervised methods while requiring only 1/13 of the runtime. When equipped with TTA, the enhanced method further narrows the gap with fully supervised methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13795v1-abstract-full').style.display = 'none'; document.getElementById('2501.13795v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17372">arXiv:2412.17372</a> <span> [<a href="https://arxiv.org/pdf/2412.17372">pdf</a>, <a href="https://arxiv.org/ps/2412.17372">ps</a>, <a href="https://arxiv.org/format/2412.17372">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Outage Probability Analysis of Uplink Heterogeneous Non-terrestrial Networks: A Novel Stochastic Geometry Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+W">Wen-Yu Dong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shaoshi Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jia-Xing Gui</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17372v1-abstract-short" style="display: inline;"> In harsh environments such as mountainous terrain, dense vegetation areas, or urban landscapes, a single type of unmanned aerial vehicles (UAVs) may encounter challenges like flight restrictions, difficulty in task execution, or increased risk. Therefore, employing multiple types of UAVs, along with satellite assistance, to collaborate becomes essential in such scenarios. In this context, we prese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17372v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17372v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17372v1-abstract-full" style="display: none;"> In harsh environments such as mountainous terrain, dense vegetation areas, or urban landscapes, a single type of unmanned aerial vehicles (UAVs) may encounter challenges like flight restrictions, difficulty in task execution, or increased risk. Therefore, employing multiple types of UAVs, along with satellite assistance, to collaborate becomes essential in such scenarios. In this context, we present a stochastic geometry based approach for modeling the heterogeneous non-terrestrial networks (NTNs) by using the classical binomial point process and introducing a novel point process, called Mat{茅}rn hard-core cluster process (MHCCP). Our MHCCP possesses both the exclusivity and the clustering properties, thus it can better model the aircraft group composed of multiple clusters. Then, we derive closed-form expressions of the outage probability (OP) for the uplink (aerial-to-satellite) of heterogeneous NTNs. Unlike existing studies, our analysis relies on a more advanced system configuration, where the integration of beamforming and frequency division multiple access, and the shadowed-Rician (SR) fading model for interference power, are considered. The accuracy of our theoretical derivation is confirmed by Monte Carlo simulations. Our research offers fundamental insights into the system-level performance optimization of NTNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17372v1-abstract-full').style.display = 'none'; document.getElementById('2412.17372v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages,6 figures, conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in Proc. 67th IEEE Global Communications Conference (GLOBECOM 2024), Cape Town, South Africa, Dec. 8-12, 2024, pp. 2588-2593 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17210">arXiv:2412.17210</a> <span> [<a href="https://arxiv.org/pdf/2412.17210">pdf</a>, <a href="https://arxiv.org/format/2412.17210">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dual Conditioned Motion Diffusion for Pose-Based Video Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+A">Andi Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsong Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+P">Pinle Ding</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17210v1-abstract-short" style="display: inline;"> Video Anomaly Detection (VAD) is essential for computer vision research. Existing VAD methods utilize either reconstruction-based or prediction-based frameworks. The former excels at detecting irregular patterns or structures, whereas the latter is capable of spotting abnormal deviations or trends. We address pose-based video anomaly detection and introduce a novel framework called Dual Conditione… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17210v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17210v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17210v1-abstract-full" style="display: none;"> Video Anomaly Detection (VAD) is essential for computer vision research. Existing VAD methods utilize either reconstruction-based or prediction-based frameworks. The former excels at detecting irregular patterns or structures, whereas the latter is capable of spotting abnormal deviations or trends. We address pose-based video anomaly detection and introduce a novel framework called Dual Conditioned Motion Diffusion (DCMD), which enjoys the advantages of both approaches. The DCMD integrates conditioned motion and conditioned embedding to comprehensively utilize the pose characteristics and latent semantics of observed movements, respectively. In the reverse diffusion process, a motion transformer is proposed to capture potential correlations from multi-layered characteristics within the spectrum space of human motion. To enhance the discriminability between normal and abnormal instances, we design a novel United Association Discrepancy (UAD) regularization that primarily relies on a Gaussian kernel-based time association and a self-attention-based global association. Finally, a mask completion strategy is introduced during the inference stage of the reverse diffusion process to enhance the utilization of conditioned motion for the prediction branch of anomaly detection. Extensive experiments on four datasets demonstrate that our method dramatically outperforms state-of-the-art methods and exhibits superior generalization performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17210v1-abstract-full').style.display = 'none'; document.getElementById('2412.17210v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is on https://github.com/guijiejie/DCMD-main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11006">arXiv:2411.11006</a> <span> [<a href="https://arxiv.org/pdf/2411.11006">pdf</a>, <a href="https://arxiv.org/format/2411.11006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BackdoorMBTI: A Backdoor Learning Multimodal Benchmark Tool Kit for Backdoor Defense Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+H">Haiyang Yu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tian Xie</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiaping Gui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyang Wang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+P">Ping Yi</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11006v1-abstract-short" style="display: inline;"> We introduce BackdoorMBTI, the first backdoor learning toolkit and benchmark designed for multimodal evaluation across three representative modalities from eleven commonly used datasets. BackdoorMBTI provides a systematic backdoor learning pipeline, encompassing data processing, data poisoning, backdoor training, and evaluation. The generated poison datasets and backdoor models enable detailed eva… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11006v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11006v1-abstract-full" style="display: none;"> We introduce BackdoorMBTI, the first backdoor learning toolkit and benchmark designed for multimodal evaluation across three representative modalities from eleven commonly used datasets. BackdoorMBTI provides a systematic backdoor learning pipeline, encompassing data processing, data poisoning, backdoor training, and evaluation. The generated poison datasets and backdoor models enable detailed evaluation of backdoor defense methods. Given the diversity of modalities, BackdoorMBTI facilitates systematic evaluation across different data types. Furthermore, BackdoorMBTI offers a standardized approach to handling practical factors in backdoor learning, such as issues related to data quality and erroneous labels. We anticipate that BackdoorMBTI will expedite future research in backdoor defense methods within a multimodal context. Code is available at https://anonymous.4open.science/r/BackdoorMBTI-D6A1/README.md. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11006v1-abstract-full').style.display = 'none'; document.getElementById('2411.11006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05500">arXiv:2410.05500</a> <span> [<a href="https://arxiv.org/pdf/2410.05500">pdf</a>, <a href="https://arxiv.org/format/2410.05500">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Residual Kolmogorov-Arnold Network for Enhanced Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+R+C">Ray Congrui Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Sherry Wu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05500v1-abstract-short" style="display: inline;"> Despite the strong performance in many computer vision tasks, Convolutional Neural Networks (CNNs) can sometimes struggle to efficiently capture long-range, complex non-linear dependencies in deeper layers of the network. We address this limitation by introducing Residual KAN, which incorporates the Kolmogorov-Arnold Network (KAN) within the CNN framework as a residual component. Our approach uses… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05500v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05500v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05500v1-abstract-full" style="display: none;"> Despite the strong performance in many computer vision tasks, Convolutional Neural Networks (CNNs) can sometimes struggle to efficiently capture long-range, complex non-linear dependencies in deeper layers of the network. We address this limitation by introducing Residual KAN, which incorporates the Kolmogorov-Arnold Network (KAN) within the CNN framework as a residual component. Our approach uses Chebyshev polynomials as the basis for KAN convolutions that enables more expressive and adaptive feature representations while maintaining computational efficiency. The proposed RKAN blocks, when integrated into established architectures such as ResNet and DenseNet, offer consistent improvements over the baseline models on various well-known benchmarks. Our results demonstrate the potential of RKAN to enhance the capabilities of deep CNNs in visual data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05500v1-abstract-full').style.display = 'none'; document.getElementById('2410.05500v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/withray/residualKAN.git</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19685">arXiv:2409.19685</a> <span> [<a href="https://arxiv.org/pdf/2409.19685">pdf</a>, <a href="https://arxiv.org/format/2409.19685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Underwater Organism Color Enhancement via Color Code Decomposition, Adaptation and Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yeying Jin</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junming Hou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19685v1-abstract-short" style="display: inline;"> Underwater images often suffer from quality degradation due to absorption and scattering effects. Most existing underwater image enhancement algorithms produce a single, fixed-color image, limiting user flexibility and application. To address this limitation, we propose a method called \textit{ColorCode}, which enhances underwater images while offering a range of controllable color outputs. Our ap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19685v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19685v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19685v1-abstract-full" style="display: none;"> Underwater images often suffer from quality degradation due to absorption and scattering effects. Most existing underwater image enhancement algorithms produce a single, fixed-color image, limiting user flexibility and application. To address this limitation, we propose a method called \textit{ColorCode}, which enhances underwater images while offering a range of controllable color outputs. Our approach involves recovering an underwater image to a reference enhanced image through supervised training and decomposing it into color and content codes via self-reconstruction and cross-reconstruction. The color code is explicitly constrained to follow a Gaussian distribution, allowing for efficient sampling and interpolation during inference. ColorCode offers three key features: 1) color enhancement, producing an enhanced image with a fixed color; 2) color adaptation, enabling controllable adjustments of long-wavelength color components using guidance images; and 3) color interpolation, allowing for the smooth generation of multiple colors through continuous sampling of the color code. Quantitative and visual evaluations on popular and challenging benchmark datasets demonstrate the superiority of ColorCode over existing methods in providing diverse, controllable, and color-realistic enhancement results. The source code is available at https://github.com/Xiaofeng-life/ColorCode. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19685v1-abstract-full').style.display = 'none'; document.getElementById('2409.19685v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17589">arXiv:2409.17589</a> <span> [<a href="https://arxiv.org/pdf/2409.17589">pdf</a>, <a href="https://arxiv.org/format/2409.17589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Improving Fast Adversarial Training via Self-Knowledge Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chengze Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junkai Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xinli Shi</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17589v1-abstract-short" style="display: inline;"> Adversarial training has achieved remarkable advancements in defending against adversarial attacks. Among them, fast adversarial training (FAT) is gaining attention for its ability to achieve competitive robustness with fewer computing resources. Existing FAT methods typically employ a uniform strategy that optimizes all training data equally without considering the influence of different examples… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17589v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17589v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17589v1-abstract-full" style="display: none;"> Adversarial training has achieved remarkable advancements in defending against adversarial attacks. Among them, fast adversarial training (FAT) is gaining attention for its ability to achieve competitive robustness with fewer computing resources. Existing FAT methods typically employ a uniform strategy that optimizes all training data equally without considering the influence of different examples, which leads to an imbalanced optimization. However, this imbalance remains unexplored in the field of FAT. In this paper, we conduct a comprehensive study of the imbalance issue in FAT and observe an obvious class disparity regarding their performances. This disparity could be embodied from a perspective of alignment between clean and robust accuracy. Based on the analysis, we mainly attribute the observed misalignment and disparity to the imbalanced optimization in FAT, which motivates us to optimize different training data adaptively to enhance robustness. Specifically, we take disparity and misalignment into consideration. First, we introduce self-knowledge guided regularization, which assigns differentiated regularization weights to each class based on its training state, alleviating class disparity. Additionally, we propose self-knowledge guided label relaxation, which adjusts label relaxation according to the training accuracy, alleviating the misalignment and improving robustness. By combining these methods, we formulate the Self-Knowledge Guided FAT (SKG-FAT), leveraging naturally generated knowledge during training to enhance the adversarial robustness without compromising training efficiency. Extensive experiments on four standard datasets demonstrate that the SKG-FAT improves the robustness and preserves competitive clean accuracy, outperforming the state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17589v1-abstract-full').style.display = 'none'; document.getElementById('2409.17589v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14774">arXiv:2409.14774</a> <span> [<a href="https://arxiv.org/pdf/2409.14774">pdf</a>, <a href="https://arxiv.org/format/2409.14774">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIFS.2024.3436528">10.1109/TIFS.2024.3436528 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CFVNet: An End-to-End Cancelable Finger Vein Network for Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14774v1-abstract-short" style="display: inline;"> Finger vein recognition technology has become one of the primary solutions for high-security identification systems. However, it still has information leakage problems, which seriously jeopardizes users privacy and anonymity and cause great security risks. In addition, there is no work to consider a fully integrated secure finger vein recognition system. So, different from the previous systems, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14774v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14774v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14774v1-abstract-full" style="display: none;"> Finger vein recognition technology has become one of the primary solutions for high-security identification systems. However, it still has information leakage problems, which seriously jeopardizes users privacy and anonymity and cause great security risks. In addition, there is no work to consider a fully integrated secure finger vein recognition system. So, different from the previous systems, we integrate preprocessing and template protection into an integrated deep learning model. We propose an end-to-end cancelable finger vein network (CFVNet), which can be used to design an secure finger vein recognition system.It includes a plug-and-play BWR-ROIAlign unit, which consists of three sub-modules: Localization, Compression and Transformation. The localization module achieves automated localization of stable and unique finger vein ROI. The compression module losslessly removes spatial and channel redundancies. The transformation module uses the proposed BWR method to introduce unlinkability, irreversibility and revocability to the system. BWR-ROIAlign can directly plug into the model to introduce the above features for DCNN-based finger vein recognition systems. We perform extensive experiments on four public datasets to study the performance and cancelable biometric attributes of the CFVNet-based recognition system. The average accuracy, EERs and Dsys on the four datasets are 99.82%, 0.01% and 0.025, respectively, and achieves competitive performance compared with the state-of-the-arts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14774v1-abstract-full').style.display = 'none'; document.getElementById('2409.14774v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> in IEEE Transactions on Information Forensics and Security, vol. 19, pp. 7810-7823, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14336">arXiv:2409.14336</a> <span> [<a href="https://arxiv.org/pdf/2409.14336">pdf</a>, <a href="https://arxiv.org/format/2409.14336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Skeleton-based Action Recognition with Dual Visual-Text Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kuang%2C+J">Jidong Kuang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsong Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C">Chaolei Han</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14336v1-abstract-short" style="display: inline;"> Zero-shot action recognition, which addresses the issue of scalability and generalization in action recognition and allows the models to adapt to new and unseen actions dynamically, is an important research topic in computer vision communities. The key to zero-shot action recognition lies in aligning visual features with semantic vectors representing action categories. Most existing methods either… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14336v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14336v1-abstract-full" style="display: none;"> Zero-shot action recognition, which addresses the issue of scalability and generalization in action recognition and allows the models to adapt to new and unseen actions dynamically, is an important research topic in computer vision communities. The key to zero-shot action recognition lies in aligning visual features with semantic vectors representing action categories. Most existing methods either directly project visual features onto the semantic space of text category or learn a shared embedding space between the two modalities. However, a direct projection cannot accurately align the two modalities, and learning robust and discriminative embedding space between visual and text representations is often difficult. To address these issues, we introduce Dual Visual-Text Alignment (DVTA) for skeleton-based zero-shot action recognition. The DVTA consists of two alignment modules-Direct Alignment (DA) and Augmented Alignment (AA)-along with a designed Semantic Description Enhancement (SDE). The DA module maps the skeleton features to the semantic space through a specially designed visual projector, followed by the SDE, which is based on cross-attention to enhance the connection between skeleton and text, thereby reducing the gap between modalities. The AA module further strengthens the learning of the embedding space by utilizing deep metric learning to learn the similarity between skeleton and text. Our approach achieves state-of-the-art performances on several popular zero-shot skeleton-based action recognition benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14336v1-abstract-full').style.display = 'none'; document.getElementById('2409.14336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06420">arXiv:2409.06420</a> <span> [<a href="https://arxiv.org/pdf/2409.06420">pdf</a>, <a href="https://arxiv.org/format/2409.06420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unrevealed Threats: A Comprehensive Study of the Adversarial Robustness of Underwater Image Enhancement Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhai%2C+S">Siyu Zhai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhibo He</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junming Hou</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=You%2C+J+W">Jian Wei You</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+X">Xin Gong</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06420v1-abstract-short" style="display: inline;"> Learning-based methods for underwater image enhancement (UWIE) have undergone extensive exploration. However, learning-based models are usually vulnerable to adversarial examples so as the UWIE models. To the best of our knowledge, there is no comprehensive study on the adversarial robustness of UWIE models, which indicates that UWIE models are potentially under the threat of adversarial attacks.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06420v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06420v1-abstract-full" style="display: none;"> Learning-based methods for underwater image enhancement (UWIE) have undergone extensive exploration. However, learning-based models are usually vulnerable to adversarial examples so as the UWIE models. To the best of our knowledge, there is no comprehensive study on the adversarial robustness of UWIE models, which indicates that UWIE models are potentially under the threat of adversarial attacks. In this paper, we propose a general adversarial attack protocol. We make a first attempt to conduct adversarial attacks on five well-designed UWIE models on three common underwater image benchmark datasets. Considering the scattering and absorption of light in the underwater environment, there exists a strong correlation between color correction and underwater image enhancement. On the basis of that, we also design two effective UWIE-oriented adversarial attack methods Pixel Attack and Color Shift Attack targeting different color spaces. The results show that five models exhibit varying degrees of vulnerability to adversarial attacks and well-designed small perturbations on degraded images are capable of preventing UWIE models from generating enhanced results. Further, we conduct adversarial training on these models and successfully mitigated the effectiveness of adversarial attacks. In summary, we reveal the adversarial vulnerability of UWIE models and propose a new evaluation dimension of UWIE models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06420v1-abstract-full').style.display = 'none'; document.getElementById('2409.06420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17129">arXiv:2408.17129</a> <span> [<a href="https://arxiv.org/pdf/2408.17129">pdf</a>, <a href="https://arxiv.org/ps/2408.17129">ps</a>, <a href="https://arxiv.org/format/2408.17129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph Neural Networks for Drug Response Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaodi Li</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jianfeng Gui</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qian Gao</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haoyuan Shi</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Z">Zhenyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17129v2-abstract-short" style="display: inline;"> Graph Neural Networks have been widely applied in critical decision-making areas that demand interpretable predictions, leading to the flourishing development of interpretability algorithms. However, current graph interpretability algorithms tend to emphasize generality and often overlook biological significance, thereby limiting their applicability in predicting cancer drug responses. In this pap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17129v2-abstract-full').style.display = 'inline'; document.getElementById('2408.17129v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17129v2-abstract-full" style="display: none;"> Graph Neural Networks have been widely applied in critical decision-making areas that demand interpretable predictions, leading to the flourishing development of interpretability algorithms. However, current graph interpretability algorithms tend to emphasize generality and often overlook biological significance, thereby limiting their applicability in predicting cancer drug responses. In this paper, we propose a novel post-hoc interpretability algorithm for cancer drug response prediction, CETExplainer, which incorporates a controllable edge-type-specific weighting mechanism. It considers the mutual information between subgraphs and predictions, proposing a structural scoring approach to provide fine-grained, biologically meaningful explanations for predictive models. We also introduce a method for constructing ground truth based on real-world datasets to quantitatively evaluate the proposed interpretability algorithm. Empirical analysis on the real-world dataset demonstrates that CETExplainer achieves superior stability and improves explanation quality compared to leading algorithms, thereby offering a robust and insightful tool for cancer drug prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17129v2-abstract-full').style.display = 'none'; document.getElementById('2408.17129v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15778">arXiv:2408.15778</a> <span> [<a href="https://arxiv.org/pdf/2408.15778">pdf</a>, <a href="https://arxiv.org/format/2408.15778">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiayi Gui</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiming Liu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiale Cheng</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xiaotao Gu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minlie Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15778v4-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated notable capabilities across various tasks, showcasing complex problem-solving abilities. Understanding and executing complex rules, along with multi-step planning, are fundamental to logical reasoning and critical for practical LLM agents and decision-making systems. However, evaluating LLMs as effective rule-based executors and planners remains under… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15778v4-abstract-full').style.display = 'inline'; document.getElementById('2408.15778v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15778v4-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated notable capabilities across various tasks, showcasing complex problem-solving abilities. Understanding and executing complex rules, along with multi-step planning, are fundamental to logical reasoning and critical for practical LLM agents and decision-making systems. However, evaluating LLMs as effective rule-based executors and planners remains underexplored. In this paper, we introduce LogicGame, a novel benchmark designed to evaluate the comprehensive rule understanding, execution, and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame provides diverse games that contain a series of rules with an initial state, requiring models to comprehend and apply predefined regulations to solve problems. We create simulated scenarios in which models execute or plan operations to achieve specific outcomes. These game scenarios are specifically designed to distinguish logical reasoning from mere knowledge by relying exclusively on predefined rules. This separation allows for a pure assessment of rule-based reasoning capabilities. The evaluation considers not only final outcomes but also intermediate steps, providing a comprehensive assessment of model performance. Moreover, these intermediate steps are deterministic and can be automatically verified. LogicGame defines game scenarios with varying difficulty levels, from simple rule applications to complex reasoning chains, in order to offer a precise evaluation of model performance on rule understanding and multi-step execution. Utilizing LogicGame, we test various LLMs and identify notable shortcomings in their rule-based logical reasoning abilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15778v4-abstract-full').style.display = 'none'; document.getElementById('2408.15778v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03944">arXiv:2408.03944</a> <span> [<a href="https://arxiv.org/pdf/2408.03944">pdf</a>, <a href="https://arxiv.org/format/2408.03944">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Fast Adversarial Training Paradigm: An Example Taxonomy Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chengze Jiang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+K">Kun Tong</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xinli Shi</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03944v2-abstract-short" style="display: inline;"> While adversarial training is an effective defense method against adversarial attacks, it notably increases the training cost. To this end, fast adversarial training (FAT) is presented for efficient training and has become a hot research topic. However, FAT suffers from catastrophic overfitting, which leads to a performance drop compared with multi-step adversarial training. However, the cause of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03944v2-abstract-full').style.display = 'inline'; document.getElementById('2408.03944v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03944v2-abstract-full" style="display: none;"> While adversarial training is an effective defense method against adversarial attacks, it notably increases the training cost. To this end, fast adversarial training (FAT) is presented for efficient training and has become a hot research topic. However, FAT suffers from catastrophic overfitting, which leads to a performance drop compared with multi-step adversarial training. However, the cause of catastrophic overfitting remains unclear and lacks exploration. In this paper, we present an example taxonomy in FAT, which identifies that catastrophic overfitting is caused by the imbalance between the inner and outer optimization in FAT. Furthermore, we investigated the impact of varying degrees of training loss, revealing a correlation between training loss and catastrophic overfitting. Based on these observations, we redesign the loss function in FAT with the proposed dynamic label relaxation to concentrate the loss range and reduce the impact of misclassified examples. Meanwhile, we introduce batch momentum initialization to enhance the diversity to prevent catastrophic overfitting in an efficient manner. Furthermore, we also propose Catastrophic Overfitting aware Loss Adaptation (COLA), which employs a separate training strategy for examples based on their loss degree. Our proposed method, named example taxonomy aware FAT (ETA), establishes an improved paradigm for FAT. Experiment results demonstrate our ETA achieves state-of-the-art performance. Comprehensive experiments on four standard datasets demonstrate the competitiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03944v2-abstract-full').style.display = 'none'; document.getElementById('2408.03944v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09924">arXiv:2407.09924</a> <span> [<a href="https://arxiv.org/pdf/2407.09924">pdf</a>, <a href="https://arxiv.org/format/2407.09924">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Region-aware Image-based Human Action Retrieval with Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jianhua Zhao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09924v2-abstract-short" style="display: inline;"> Human action understanding is a fundamental and challenging task in computer vision. Although there exists tremendous research on this area, most works focus on action recognition, while action retrieval has received less attention. In this paper, we focus on the neglected but important task of image-based action retrieval which aims to find images that depict the same action as a query image. We… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09924v2-abstract-full').style.display = 'inline'; document.getElementById('2407.09924v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09924v2-abstract-full" style="display: none;"> Human action understanding is a fundamental and challenging task in computer vision. Although there exists tremendous research on this area, most works focus on action recognition, while action retrieval has received less attention. In this paper, we focus on the neglected but important task of image-based action retrieval which aims to find images that depict the same action as a query image. We establish benchmarks for this task and set up important baseline methods for fair comparison. We present an end-to-end model that learns rich action representations from three aspects: the anchored person, contextual regions, and the global image. A novel fusion transformer module is designed to model the relationships among different features and effectively fuse them into an action representation. Experiments on the Stanford-40 and PASCAL VOC 2012 Action datasets show that the proposed method significantly outperforms previous approaches for image-based action retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09924v2-abstract-full').style.display = 'none'; document.getElementById('2407.09924v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12793">arXiv:2406.12793</a> <span> [<a href="https://arxiv.org/pdf/2406.12793">pdf</a>, <a href="https://arxiv.org/format/2406.12793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=GLM%2C+T">Team GLM</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+A">Aohan Zeng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bin Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bowen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenhui Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+D">Da Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&query=Rojas%2C+D">Diego Rojas</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+G">Guanyu Feng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanlin Zhao</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+H">Hanyu Lai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hao Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiadai Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajie Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiale Cheng</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiayi Gui</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jingyu Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juanzi Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lindong Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+L">Lucen Zhong</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12793v2-abstract-short" style="display: inline;"> We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most capable models that are trained with all the insights and lessons gained from the preceding three generations of ChatGLM. To date, the GLM-4 models are pre-trained… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12793v2-abstract-full').style.display = 'inline'; document.getElementById('2406.12793v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12793v2-abstract-full" style="display: none;"> We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most capable models that are trained with all the insights and lessons gained from the preceding three generations of ChatGLM. To date, the GLM-4 models are pre-trained on ten trillions of tokens mostly in Chinese and English, along with a small set of corpus from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU, GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3) matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide when and which tool(s) touse -- including web browser, Python interpreter, text-to-image model, and user-defined functions -- to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All Tools in tasks like accessing online information via web browsing and solving math problems using Python interpreter. Over the course, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M), GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging face in the year 2023 alone. The open models can be accessed through https://github.com/THUDM and https://huggingface.co/THUDM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12793v2-abstract-full').style.display = 'none'; document.getElementById('2406.12793v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09333">arXiv:2406.09333</a> <span> [<a href="https://arxiv.org/pdf/2406.09333">pdf</a>, <a href="https://arxiv.org/format/2406.09333">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Memory-Efficient Sparse Pyramid Attention Networks for Whole Slide Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Weiyi Wu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chongyang Gao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinwen Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siting Li</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiang Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09333v1-abstract-short" style="display: inline;"> Whole Slide Images (WSIs) are crucial for modern pathological diagnosis, yet their gigapixel-scale resolutions and sparse informative regions pose significant computational challenges. Traditional dense attention mechanisms, widely used in computer vision and natural language processing, are impractical for WSI analysis due to the substantial data scale and the redundant processing of uninformativ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09333v1-abstract-full').style.display = 'inline'; document.getElementById('2406.09333v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09333v1-abstract-full" style="display: none;"> Whole Slide Images (WSIs) are crucial for modern pathological diagnosis, yet their gigapixel-scale resolutions and sparse informative regions pose significant computational challenges. Traditional dense attention mechanisms, widely used in computer vision and natural language processing, are impractical for WSI analysis due to the substantial data scale and the redundant processing of uninformative areas. To address these challenges, we propose Memory-Efficient Sparse Pyramid Attention Networks with Shifted Windows (SPAN), drawing inspiration from state-of-the-art sparse attention techniques in other domains. SPAN introduces a sparse pyramid attention architecture that hierarchically focuses on informative regions within the WSI, aiming to reduce memory overhead while preserving critical features. Additionally, the incorporation of shifted windows enables the model to capture long-range contextual dependencies essential for accurate classification. We evaluated SPAN on multiple public WSI datasets, observing its competitive performance. Unlike existing methods that often struggle to model spatial and contextual information due to memory constraints, our approach enables the accurate modeling of these crucial features. Our study also highlights the importance of key design elements in attention mechanisms, such as the shifted-window scheme and the hierarchical structure, which contribute substantially to the effectiveness of SPAN in WSI analysis. The potential of SPAN for memory-efficient and effective analysis of WSI data is thus demonstrated, and the code will be made publicly available following the publication of this work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09333v1-abstract-full').style.display = 'none'; document.getElementById('2406.09333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19684">arXiv:2405.19684</a> <span> [<a href="https://arxiv.org/pdf/2405.19684">pdf</a>, <a href="https://arxiv.org/format/2405.19684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey on Underwater Image Enhancement Based on Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junming Hou</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19684v3-abstract-short" style="display: inline;"> Underwater image enhancement (UIE) presents a significant challenge within computer vision research. Despite the development of numerous UIE algorithms, a thorough and systematic review is still absent. To foster future advancements, we provide a detailed overview of the UIE task from several perspectives. Firstly, we introduce the physical models, data construction processes, evaluation metrics,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19684v3-abstract-full').style.display = 'inline'; document.getElementById('2405.19684v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19684v3-abstract-full" style="display: none;"> Underwater image enhancement (UIE) presents a significant challenge within computer vision research. Despite the development of numerous UIE algorithms, a thorough and systematic review is still absent. To foster future advancements, we provide a detailed overview of the UIE task from several perspectives. Firstly, we introduce the physical models, data construction processes, evaluation metrics, and loss functions. Secondly, we categorize and discuss recent algorithms based on their contributions, considering six aspects: network architecture, learning strategy, learning stage, auxiliary tasks, domain perspective, and disentanglement fusion. Thirdly, due to the varying experimental setups in the existing literature, a comprehensive and unbiased comparison is currently unavailable. To address this, we perform both quantitative and qualitative evaluations of state-of-the-art algorithms across multiple benchmark datasets. Lastly, we identify key areas for future research in UIE. A collection of resources for UIE can be found at {https://github.com/YuZhao1999/UIE}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19684v3-abstract-full').style.display = 'none'; document.getElementById('2405.19684v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A survey on the underwater image enhancement task</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19062">arXiv:2405.19062</a> <span> [<a href="https://arxiv.org/pdf/2405.19062">pdf</a>, <a href="https://arxiv.org/format/2405.19062">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SIG: Efficient Self-Interpretable Graph Neural Network for Continuous-time Dynamic Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lanting Fang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yulian Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shanshan Feng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+K">Kaiyu Feng</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuliang Wang</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+Y">Yew-Soon Ong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19062v1-abstract-short" style="display: inline;"> While dynamic graph neural networks have shown promise in various applications, explaining their predictions on continuous-time dynamic graphs (CTDGs) is difficult. This paper investigates a new research task: self-interpretable GNNs for CTDGs. We aim to predict future links within the dynamic graph while simultaneously providing causal explanations for these predictions. There are two key challen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19062v1-abstract-full').style.display = 'inline'; document.getElementById('2405.19062v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19062v1-abstract-full" style="display: none;"> While dynamic graph neural networks have shown promise in various applications, explaining their predictions on continuous-time dynamic graphs (CTDGs) is difficult. This paper investigates a new research task: self-interpretable GNNs for CTDGs. We aim to predict future links within the dynamic graph while simultaneously providing causal explanations for these predictions. There are two key challenges: (1) capturing the underlying structural and temporal information that remains consistent across both independent and identically distributed (IID) and out-of-distribution (OOD) data, and (2) efficiently generating high-quality link prediction results and explanations. To tackle these challenges, we propose a novel causal inference model, namely the Independent and Confounded Causal Model (ICCM). ICCM is then integrated into a deep learning architecture that considers both effectiveness and efficiency. Extensive experiments demonstrate that our proposed model significantly outperforms existing methods across link prediction accuracy, explanation quality, and robustness to shortcut features. Our code and datasets are anonymously released at https://github.com/2024SIG/SIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19062v1-abstract-full').style.display = 'none'; document.getElementById('2405.19062v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16086">arXiv:2405.16086</a> <span> [<a href="https://arxiv.org/pdf/2405.16086">pdf</a>, <a href="https://arxiv.org/format/2405.16086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> An Experimental Study of Different Aggregation Schemes in Semi-Asynchronous Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunbo Li</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiaping Gui</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yue Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16086v1-abstract-short" style="display: inline;"> Federated learning is highly valued due to its high-performance computing in distributed environments while safeguarding data privacy. To address resource heterogeneity, researchers have proposed a semi-asynchronous federated learning (SAFL) architecture. However, the performance gap between different aggregation targets in SAFL remain unexplored. In this paper, we systematically compare the per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16086v1-abstract-full').style.display = 'inline'; document.getElementById('2405.16086v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16086v1-abstract-full" style="display: none;"> Federated learning is highly valued due to its high-performance computing in distributed environments while safeguarding data privacy. To address resource heterogeneity, researchers have proposed a semi-asynchronous federated learning (SAFL) architecture. However, the performance gap between different aggregation targets in SAFL remain unexplored. In this paper, we systematically compare the performance between two algorithm modes, FedSGD and FedAvg that correspond to aggregating gradients and models, respectively. Our results across various task scenarios indicate these two modes exhibit a substantial performance gap. Specifically, FedSGD achieves higher accuracy and faster convergence but experiences more severe fluctuates in accuracy, whereas FedAvg excels in handling straggler issues but converges slower with reduced accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16086v1-abstract-full').style.display = 'none'; document.getElementById('2405.16086v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13830">arXiv:2404.13830</a> <span> [<a href="https://arxiv.org/pdf/2404.13830">pdf</a>, <a href="https://arxiv.org/format/2404.13830">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning-Based Point Cloud Registration: A Comprehensive Survey and Taxonomy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu-Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Baosheng Yu</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+X">Xin Gong</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+W">Wenbing Tao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13830v3-abstract-short" style="display: inline;"> Point cloud registration involves determining a rigid transformation to align a source point cloud with a target point cloud. This alignment is fundamental in applications such as autonomous driving, robotics, and medical imaging, where precise spatial correspondence is essential. Deep learning has greatly advanced point cloud registration by providing robust and efficient methods that address the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13830v3-abstract-full').style.display = 'inline'; document.getElementById('2404.13830v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13830v3-abstract-full" style="display: none;"> Point cloud registration involves determining a rigid transformation to align a source point cloud with a target point cloud. This alignment is fundamental in applications such as autonomous driving, robotics, and medical imaging, where precise spatial correspondence is essential. Deep learning has greatly advanced point cloud registration by providing robust and efficient methods that address the limitations of traditional approaches, including sensitivity to noise, outliers, and initialization. However, a well-constructed taxonomy for these methods is still lacking, making it difficult to systematically classify and compare the various approaches. In this paper, we present a comprehensive survey and taxonomy on deep learning-based point cloud registration (DL-PCR). We begin with a formal description of the point cloud registration problem, followed by an overview of the datasets, evaluation metrics, and loss functions commonly used in DL-PCR. Next, we categorize existing DL-PCR methods into supervised and unsupervised approaches, as they focus on significantly different key aspects. For supervised DL-PCR methods, we organize the discussion based on key aspects, including the registration procedure, optimization strategy, learning paradigm, network enhancement, and integration with traditional methods; For unsupervised DL-PCR methods, we classify them into correspondence-based and correspondence-free approaches, depending on whether they require explicit identification of point-to-point correspondences. To facilitate a more comprehensive and fair comparison, we conduct quantitative evaluations of all recent state-of-the-art approaches, using a unified training setting and consistent data partitioning strategy. Lastly, we highlight the open challenges and discuss potential directions for future study. A comprehensive collection is available at https://github.com/yxzhang15/PCR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13830v3-abstract-full').style.display = 'none'; document.getElementById('2404.13830v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.18548">arXiv:2403.18548</a> <span> [<a href="https://arxiv.org/pdf/2403.18548">pdf</a>, <a href="https://arxiv.org/format/2403.18548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Semi-supervised Nighttime Dehazing Baseline with Spatial-Frequency Aware and Realistic Brightness Constraint </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junming Hou</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.18548v1-abstract-short" style="display: inline;"> Existing research based on deep learning has extensively explored the problem of daytime image dehazing. However, few studies have considered the characteristics of nighttime hazy scenes. There are two distinctions between nighttime and daytime haze. First, there may be multiple active colored light sources with lower illumination intensity in nighttime scenes, which may cause haze, glow and noise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18548v1-abstract-full').style.display = 'inline'; document.getElementById('2403.18548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.18548v1-abstract-full" style="display: none;"> Existing research based on deep learning has extensively explored the problem of daytime image dehazing. However, few studies have considered the characteristics of nighttime hazy scenes. There are two distinctions between nighttime and daytime haze. First, there may be multiple active colored light sources with lower illumination intensity in nighttime scenes, which may cause haze, glow and noise with localized, coupled and frequency inconsistent characteristics. Second, due to the domain discrepancy between simulated and real-world data, unrealistic brightness may occur when applying a dehazing model trained on simulated data to real-world data. To address the above two issues, we propose a semi-supervised model for real-world nighttime dehazing. First, the spatial attention and frequency spectrum filtering are implemented as a spatial-frequency domain information interaction module to handle the first issue. Second, a pseudo-label-based retraining strategy and a local window-based brightness loss for semi-supervised training process is designed to suppress haze and glow while achieving realistic brightness. Experiments on public benchmarks validate the effectiveness of the proposed method and its superiority over state-of-the-art methods. The source code and Supplementary Materials are placed in the https://github.com/Xiaofeng-life/SFSNiD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.18548v1-abstract-full').style.display = 'none'; document.getElementById('2403.18548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by CVPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05675">arXiv:2306.05675</a> <span> [<a href="https://arxiv.org/pdf/2306.05675">pdf</a>, <a href="https://arxiv.org/format/2306.05675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Illumination Controllable Dehazing Network based on Unsupervised Retinex Embedding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05675v1-abstract-short" style="display: inline;"> On the one hand, the dehazing task is an illposedness problem, which means that no unique solution exists. On the other hand, the dehazing task should take into account the subjective factor, which is to give the user selectable dehazed images rather than a single result. Therefore, this paper proposes a multi-output dehazing network by introducing illumination controllable ability, called IC-Deha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05675v1-abstract-full').style.display = 'inline'; document.getElementById('2306.05675v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05675v1-abstract-full" style="display: none;"> On the one hand, the dehazing task is an illposedness problem, which means that no unique solution exists. On the other hand, the dehazing task should take into account the subjective factor, which is to give the user selectable dehazed images rather than a single result. Therefore, this paper proposes a multi-output dehazing network by introducing illumination controllable ability, called IC-Dehazing. The proposed IC-Dehazing can change the illumination intensity by adjusting the factor of the illumination controllable module, which is realized based on the interpretable Retinex theory. Moreover, the backbone dehazing network of IC-Dehazing consists of a Transformer with double decoders for high-quality image restoration. Further, the prior-based loss function and unsupervised training strategy enable IC-Dehazing to complete the parameter learning process without the need for paired data. To demonstrate the effectiveness of the proposed IC-Dehazing, quantitative and qualitative experiments are conducted on image dehazing, semantic segmentation, and object detection tasks. Code is available at https://github.com/Xiaofeng-life/ICDehazing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05675v1-abstract-full').style.display = 'none'; document.getElementById('2306.05675v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.18049">arXiv:2303.18049</a> <span> [<a href="https://arxiv.org/pdf/2303.18049">pdf</a>, <a href="https://arxiv.org/format/2303.18049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> No Place to Hide: Dual Deep Interaction Channel Network for Fake News Detection based on Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+B">Biwei Cao</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+L">Lulu Hua</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiuxin Cao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.18049v1-abstract-short" style="display: inline;"> Online Social Network (OSN) has become a hotbed of fake news due to the low cost of information dissemination. Although the existing methods have made many attempts in news content and propagation structure, the detection of fake news is still facing two challenges: one is how to mine the unique key features and evolution patterns, and the other is how to tackle the problem of small samples to bui… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.18049v1-abstract-full').style.display = 'inline'; document.getElementById('2303.18049v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.18049v1-abstract-full" style="display: none;"> Online Social Network (OSN) has become a hotbed of fake news due to the low cost of information dissemination. Although the existing methods have made many attempts in news content and propagation structure, the detection of fake news is still facing two challenges: one is how to mine the unique key features and evolution patterns, and the other is how to tackle the problem of small samples to build the high-performance model. Different from popular methods which take full advantage of the propagation topology structure, in this paper, we propose a novel framework for fake news detection from perspectives of semantic, emotion and data enhancement, which excavates the emotional evolution patterns of news participants during the propagation process, and a dual deep interaction channel network of semantic and emotion is designed to obtain a more comprehensive and fine-grained news representation with the consideration of comments. Meanwhile, the framework introduces a data enhancement module to obtain more labeled data with high quality based on confidence which further improves the performance of the classification model. Experiments show that the proposed approach outperforms the state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.18049v1-abstract-full').style.display = 'none'; document.getElementById('2303.18049v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.17255">arXiv:2303.17255</a> <span> [<a href="https://arxiv.org/pdf/2303.17255">pdf</a>, <a href="https://arxiv.org/format/2303.17255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fooling the Image Dehazing Models by First Order Gradient </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chengwei Peng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.17255v2-abstract-short" style="display: inline;"> The research on the single image dehazing task has been widely explored. However, as far as we know, no comprehensive study has been conducted on the robustness of the well-trained dehazing models. Therefore, there is no evidence that the dehazing networks can resist malicious attacks. In this paper, we focus on designing a group of attack methods based on first order gradient to verify the robust… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17255v2-abstract-full').style.display = 'inline'; document.getElementById('2303.17255v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.17255v2-abstract-full" style="display: none;"> The research on the single image dehazing task has been widely explored. However, as far as we know, no comprehensive study has been conducted on the robustness of the well-trained dehazing models. Therefore, there is no evidence that the dehazing networks can resist malicious attacks. In this paper, we focus on designing a group of attack methods based on first order gradient to verify the robustness of the existing dehazing algorithms. By analyzing the general purpose of image dehazing task, four attack methods are proposed, which are predicted dehazed image attack, hazy layer mask attack, haze-free image attack and haze-preserved attack. The corresponding experiments are conducted on six datasets with different scales. Further, the defense strategy based on adversarial training is adopted for reducing the negative effects caused by malicious attacks. In summary, this paper defines a new challenging problem for the image dehazing area, which can be called as adversarial attack on dehazing networks (AADN). Code and Supplementary Material are available at https://github.com/Xiaofeng-life/AADN Dehazing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.17255v2-abstract-full').style.display = 'none'; document.getElementById('2303.17255v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.05712">arXiv:2301.05712</a> <span> [<a href="https://arxiv.org/pdf/2301.05712">pdf</a>, <a href="https://arxiv.org/format/2301.05712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Self-supervised Learning: Algorithms, Applications, and Future Trends </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Q">Qiong Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hao Luo</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.05712v4-abstract-short" style="display: inline;"> Deep supervised learning algorithms typically require a large volume of labeled data to achieve satisfactory performance. However, the process of collecting and labeling such data can be expensive and time-consuming. Self-supervised learning (SSL), a subset of unsupervised learning, aims to learn discriminative features from unlabeled data without relying on human-annotated labels. SSL has garnere… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05712v4-abstract-full').style.display = 'inline'; document.getElementById('2301.05712v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.05712v4-abstract-full" style="display: none;"> Deep supervised learning algorithms typically require a large volume of labeled data to achieve satisfactory performance. However, the process of collecting and labeling such data can be expensive and time-consuming. Self-supervised learning (SSL), a subset of unsupervised learning, aims to learn discriminative features from unlabeled data without relying on human-annotated labels. SSL has garnered significant attention recently, leading to the development of numerous related algorithms. However, there is a dearth of comprehensive studies that elucidate the connections and evolution of different SSL variants. This paper presents a review of diverse SSL methods, encompassing algorithmic aspects, application domains, three key trends, and open research questions. Firstly, we provide a detailed introduction to the motivations behind most SSL algorithms and compare their commonalities and differences. Secondly, we explore representative applications of SSL in domains such as image processing, computer vision, and natural language processing. Lastly, we discuss the three primary trends observed in SSL research and highlight the open questions that remain. A curated collection of valuable resources can be accessed at https://github.com/guijiejie/SSL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.05712v4-abstract-full').style.display = 'none'; document.getElementById('2301.05712v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2301.03041">arXiv:2301.03041</a> <span> [<a href="https://arxiv.org/pdf/2301.03041">pdf</a>, <a href="https://arxiv.org/format/2301.03041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIP.2023.3276708">10.1109/TIP.2023.3276708 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning the Relation between Similarity Loss and Clustering Loss in Self-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jidong Ge</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lanting Fang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+M">Ming Lin</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">LiGuo Huang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2301.03041v2-abstract-short" style="display: inline;"> Self-supervised learning enables networks to learn discriminative features from massive data itself. Most state-of-the-art methods maximize the similarity between two augmentations of one image based on contrastive learning. By utilizing the consistency of two augmentations, the burden of manual annotations can be freed. Contrastive learning exploits instance-level information to learn robust feat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03041v2-abstract-full').style.display = 'inline'; document.getElementById('2301.03041v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2301.03041v2-abstract-full" style="display: none;"> Self-supervised learning enables networks to learn discriminative features from massive data itself. Most state-of-the-art methods maximize the similarity between two augmentations of one image based on contrastive learning. By utilizing the consistency of two augmentations, the burden of manual annotations can be freed. Contrastive learning exploits instance-level information to learn robust features. However, the learned information is probably confined to different views of the same instance. In this paper, we attempt to leverage the similarity between two distinct images to boost representation in self-supervised learning. In contrast to instance-level information, the similarity between two distinct images may provide more useful information. Besides, we analyze the relation between similarity loss and feature-level cross-entropy loss. These two losses are essential for most deep learning methods. However, the relation between these two losses is not clear. Similarity loss helps obtain instance-level representation, while feature-level cross-entropy loss helps mine the similarity between two distinct images. We provide theoretical analyses and experiments to show that a suitable combination of these two losses can get state-of-the-art results. Code is available at https://github.com/guijiejie/ICCL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2301.03041v2-abstract-full').style.display = 'none'; document.getElementById('2301.03041v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 January, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by IEEE Transactions on Image Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.03112">arXiv:2212.03112</a> <span> [<a href="https://arxiv.org/pdf/2212.03112">pdf</a>, <a href="https://arxiv.org/format/2212.03112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fast Online Hashing with Multi-Label Projection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+W">Wenzhe Jia</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junwei Liu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.03112v1-abstract-short" style="display: inline;"> Hashing has been widely researched to solve the large-scale approximate nearest neighbor search problem owing to its time and storage superiority. In recent years, a number of online hashing methods have emerged, which can update the hash functions to adapt to the new stream data and realize dynamic retrieval. However, existing online hashing methods are required to update the whole database with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03112v1-abstract-full').style.display = 'inline'; document.getElementById('2212.03112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2212.03112v1-abstract-full" style="display: none;"> Hashing has been widely researched to solve the large-scale approximate nearest neighbor search problem owing to its time and storage superiority. In recent years, a number of online hashing methods have emerged, which can update the hash functions to adapt to the new stream data and realize dynamic retrieval. However, existing online hashing methods are required to update the whole database with the latest hash functions when a query arrives, which leads to low retrieval efficiency with the continuous increase of the stream data. On the other hand, these methods ignore the supervision relationship among the examples, especially in the multi-label case. In this paper, we propose a novel Fast Online Hashing (FOH) method which only updates the binary codes of a small part of the database. To be specific, we first build a query pool in which the nearest neighbors of each central point are recorded. When a new query arrives, only the binary codes of the corresponding potential neighbors are updated. In addition, we create a similarity matrix which takes the multi-label supervision information into account and bring in the multi-label projection loss to further preserve the similarity among the multi-label data. The experimental results on two common benchmarks show that the proposed FOH can achieve dramatic superiority on query time up to 6.28 seconds less than state-of-the-art baselines with competitive retrieval accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.03112v1-abstract-full').style.display = 'none'; document.getElementById('2212.03112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by AAAI Conference on Artificial Intelligence (AAAI), 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.15362">arXiv:2211.15362</a> <span> [<a href="https://arxiv.org/pdf/2211.15362">pdf</a>, <a href="https://arxiv.org/format/2211.15362">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Coordination of Frequency and Attention in Masked Image Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tuo Chen</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengqi Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hao Luo</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.15362v3-abstract-short" style="display: inline;"> Recently, masked image modeling (MIM), which learns visual representations by reconstructing the masked patches of an image, has dominated self-supervised learning in computer vision. However, the pre-training of MIM always takes massive time due to the large-scale data and large-size backbones. We mainly attribute it to the random patch masking in previous MIM works, which fails to leverage the c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15362v3-abstract-full').style.display = 'inline'; document.getElementById('2211.15362v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.15362v3-abstract-full" style="display: none;"> Recently, masked image modeling (MIM), which learns visual representations by reconstructing the masked patches of an image, has dominated self-supervised learning in computer vision. However, the pre-training of MIM always takes massive time due to the large-scale data and large-size backbones. We mainly attribute it to the random patch masking in previous MIM works, which fails to leverage the crucial semantic information for effective visual representation learning. To tackle this issue, we propose the Frequency \& Attention-driven Masking and Throwing Strategy (FAMT), which can extract semantic patches and reduce the number of training patches to boost model performance and training efficiency simultaneously. Specifically, FAMT utilizes the self-attention mechanism to extract semantic information from the image for masking during training in an unsupervised manner. However, attention alone could sometimes focus on inappropriate areas regarding the semantic information. Thus, we are motivated to incorporate the information from the frequency domain into the self-attention mechanism to derive the sampling weights for masking, which captures semantic patches for visual representation learning. Furthermore, we introduce a patch throwing strategy based on the derived sampling weights to reduce the training cost. FAMT can be seamlessly integrated as a plug-and-play module and surpasses previous works, \emph{e.g.} reducing the training phase time by nearly $50\%$ and improving the linear probing accuracy of MAE by $1.3\% \sim 3.9\%$ across various datasets, including CIFAR-10/100, Tiny ImageNet, and ImageNet-1K. FAMT also demonstrates superior performance in downstream detection and segmentation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.15362v3-abstract-full').style.display = 'none'; document.getElementById('2211.15362v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.08736">arXiv:2211.08736</a> <span> [<a href="https://arxiv.org/pdf/2211.08736">pdf</a>, <a href="https://arxiv.org/format/2211.08736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMM.2022.3222118">10.1109/TMM.2022.3222118 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> AlignVE: Visual Entailment Recognition Based on Alignment Relations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+B">Biwei Cao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiuxin Cao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiayun Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y+Y">Yuan Yan Tang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James Tin-Yau Kwok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.08736v1-abstract-short" style="display: inline;"> Visual entailment (VE) is to recognize whether the semantics of a hypothesis text can be inferred from the given premise image, which is one special task among recent emerged vision and language understanding tasks. Currently, most of the existing VE approaches are derived from the methods of visual question answering. They recognize visual entailment by quantifying the similarity between the hypo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.08736v1-abstract-full').style.display = 'inline'; document.getElementById('2211.08736v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.08736v1-abstract-full" style="display: none;"> Visual entailment (VE) is to recognize whether the semantics of a hypothesis text can be inferred from the given premise image, which is one special task among recent emerged vision and language understanding tasks. Currently, most of the existing VE approaches are derived from the methods of visual question answering. They recognize visual entailment by quantifying the similarity between the hypothesis and premise in the content semantic features from multi modalities. Such approaches, however, ignore the VE's unique nature of relation inference between the premise and hypothesis. Therefore, in this paper, a new architecture called AlignVE is proposed to solve the visual entailment problem with a relation interaction method. It models the relation between the premise and hypothesis as an alignment matrix. Then it introduces a pooling operation to get feature vectors with a fixed size. Finally, it goes through the fully-connected layer and normalization layer to complete the classification. Experiments show that our alignment-based architecture reaches 72.45\% accuracy on SNLI-VE dataset, outperforming previous content-based models under the same settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.08736v1-abstract-full').style.display = 'none'; document.getElementById('2211.08736v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted for publication as a REGULAR paper in the IEEE Transactions on Multimedia</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.10709">arXiv:2110.10709</a> <span> [<a href="https://arxiv.org/pdf/2110.10709">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Predicting Tau Accumulation in Cerebral Cortex with Multivariate MRI Morphometry Measurements, Sparse Coding, and Correntropy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianfeng Wu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Lepore%2C+N">Natasha Lepore</a>, <a href="/search/cs?searchtype=author&query=Reiman%2C+E+M">Eric M. Reiman</a>, <a href="/search/cs?searchtype=author&query=Caselli%2C+R+J">Richard J. Caselli</a>, <a href="/search/cs?searchtype=author&query=Thompson%2C+P+M">Paul M. Thompson</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kewei Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.10709v1-abstract-short" style="display: inline;"> Biomarker-assisted diagnosis and intervention in Alzheimer's disease (AD) may be the key to prevention breakthroughs. One of the hallmarks of AD is the accumulation of tau plaques in the human brain. However, current methods to detect tau pathology are either invasive (lumbar puncture) or quite costly and not widely available (Tau PET). In our previous work, structural MRI-based hippocampal multiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10709v1-abstract-full').style.display = 'inline'; document.getElementById('2110.10709v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.10709v1-abstract-full" style="display: none;"> Biomarker-assisted diagnosis and intervention in Alzheimer's disease (AD) may be the key to prevention breakthroughs. One of the hallmarks of AD is the accumulation of tau plaques in the human brain. However, current methods to detect tau pathology are either invasive (lumbar puncture) or quite costly and not widely available (Tau PET). In our previous work, structural MRI-based hippocampal multivariate morphometry statistics (MMS) showed superior performance as an effective neurodegenerative biomarker for preclinical AD and Patch Analysis-based Surface Correntropy-induced Sparse coding and max-pooling (PASCS-MP) has excellent ability to generate low-dimensional representations with strong statistical power for brain amyloid prediction. In this work, we apply this framework together with ridge regression models to predict Tau deposition in Braak12 and Braak34 brain regions separately. We evaluate our framework on 925 subjects from the Alzheimer's Disease Neuroimaging Initiative (ADNI). Each subject has one pair consisting of a PET image and MRI scan which were collected at about the same times. Experimental results suggest that the representations from our MMS and PASCS-MP have stronger predictive power and their predicted Braak12 and Braak34 are closer to the real values compared to the measures derived from other approaches such as hippocampal surface area and volume, and shape morphometry features based on spherical harmonics (SPHARM). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.10709v1-abstract-full').style.display = 'none'; document.getElementById('2110.10709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures, 17th International Symposium on Medical Information Processing and Analysis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06996">arXiv:2106.06996</a> <span> [<a href="https://arxiv.org/pdf/2106.06996">pdf</a>, <a href="https://arxiv.org/format/2106.06996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pyramidal Dense Attention Networks for Lightweight Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Huapeng Wu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James T. Kwok</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhihui Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06996v1-abstract-short" style="display: inline;"> Recently, deep convolutional neural network methods have achieved an excellent performance in image superresolution (SR), but they can not be easily applied to embedded devices due to large memory cost. To solve this problem, we propose a pyramidal dense attention network (PDAN) for lightweight image super-resolution in this paper. In our method, the proposed pyramidal dense learning can gradually… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06996v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06996v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06996v1-abstract-full" style="display: none;"> Recently, deep convolutional neural network methods have achieved an excellent performance in image superresolution (SR), but they can not be easily applied to embedded devices due to large memory cost. To solve this problem, we propose a pyramidal dense attention network (PDAN) for lightweight image super-resolution in this paper. In our method, the proposed pyramidal dense learning can gradually increase the width of the densely connected layer inside a pyramidal dense block to extract deep features efficiently. Meanwhile, the adaptive group convolution that the number of groups grows linearly with dense convolutional layers is introduced to relieve the parameter explosion. Besides, we also present a novel joint attention to capture cross-dimension interaction between the spatial dimensions and channel dimension in an efficient way for providing rich discriminative feature representations. Extensive experimental results show that our method achieves superior performance in comparison with the state-of-the-art lightweight SR methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06996v1-abstract-full').style.display = 'none'; document.getElementById('2106.06996v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06966">arXiv:2106.06966</a> <span> [<a href="https://arxiv.org/pdf/2106.06966">pdf</a>, <a href="https://arxiv.org/format/2106.06966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Feedback Pyramid Attention Networks for Single Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Huapeng Wu</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Kwok%2C+J+T">James T. Kwok</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhihui Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06966v1-abstract-short" style="display: inline;"> Recently, convolutional neural network (CNN) based image super-resolution (SR) methods have achieved significant performance improvement. However, most CNN-based methods mainly focus on feed-forward architecture design and neglect to explore the feedback mechanism, which usually exists in the human visual system. In this paper, we propose feedback pyramid attention networks (FPAN) to fully exploit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06966v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06966v1-abstract-full" style="display: none;"> Recently, convolutional neural network (CNN) based image super-resolution (SR) methods have achieved significant performance improvement. However, most CNN-based methods mainly focus on feed-forward architecture design and neglect to explore the feedback mechanism, which usually exists in the human visual system. In this paper, we propose feedback pyramid attention networks (FPAN) to fully exploit the mutual dependencies of features. Specifically, a novel feedback connection structure is developed to enhance low-level feature expression with high-level information. In our method, the output of each layer in the first stage is also used as the input of the corresponding layer in the next state to re-update the previous low-level filters. Moreover, we introduce a pyramid non-local structure to model global contextual information in different scales and improve the discriminative representation of the network. Extensive experimental results on various datasets demonstrate the superiority of our FPAN in comparison with the state-of-the-art SR methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06966v1-abstract-full').style.display = 'none'; document.getElementById('2106.06966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.03323">arXiv:2106.03323</a> <span> [<a href="https://arxiv.org/pdf/2106.03323">pdf</a>, <a href="https://arxiv.org/format/2106.03323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey and Taxonomy on Single Image Dehazing Based on Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+X">Xiaofeng Cong</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+W">Wenqi Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiuxin Cao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.03323v5-abstract-short" style="display: inline;"> With the development of convolutional neural networks, hundreds of deep learning based dehazing methods have been proposed. In this paper, we provide a comprehensive survey on supervised, semi-supervised, and unsupervised single image dehazing. We first discuss the physical model, datasets, network modules, loss functions, and evaluation metrics that are commonly used. Then, the main contributions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.03323v5-abstract-full').style.display = 'inline'; document.getElementById('2106.03323v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.03323v5-abstract-full" style="display: none;"> With the development of convolutional neural networks, hundreds of deep learning based dehazing methods have been proposed. In this paper, we provide a comprehensive survey on supervised, semi-supervised, and unsupervised single image dehazing. We first discuss the physical model, datasets, network modules, loss functions, and evaluation metrics that are commonly used. Then, the main contributions of various dehazing algorithms are categorized and summarized. Further, quantitative and qualitative experiments of various baseline methods are carried out. Finally, the unsolved issues and challenges that can inspire the future research are pointed out. A collection of useful dehazing materials is available at \url{https://github.com/Xiaofeng-life/AwesomeDehazing}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.03323v5-abstract-full').style.display = 'none'; document.getElementById('2106.03323v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by ACM Computing Surveys</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2104.00453">arXiv:2104.00453</a> <span> [<a href="https://arxiv.org/pdf/2104.00453">pdf</a>, <a href="https://arxiv.org/ps/2104.00453">ps</a>, <a href="https://arxiv.org/format/2104.00453">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Functional Analysis">math.FA</span> </div> </div> <p class="title is-5 mathjax"> Learning Rates for Multi-task Regularization Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haizhang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2104.00453v3-abstract-short" style="display: inline;"> Multi-task learning is an important trend of machine learning in facing the era of artificial intelligence and big data. Despite a large amount of researches on learning rate estimates of various single-task machine learning algorithms, there is little parallel work for multi-task learning. We present mathematical analysis on the learning rate estimate of multi-task learning based on the theory of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00453v3-abstract-full').style.display = 'inline'; document.getElementById('2104.00453v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2104.00453v3-abstract-full" style="display: none;"> Multi-task learning is an important trend of machine learning in facing the era of artificial intelligence and big data. Despite a large amount of researches on learning rate estimates of various single-task machine learning algorithms, there is little parallel work for multi-task learning. We present mathematical analysis on the learning rate estimate of multi-task learning based on the theory of vector-valued reproducing kernel Hilbert spaces and matrix-valued reproducing kernels. For the typical multi-task regularization networks, an explicit learning rate dependent both on the number of sample data and the number of tasks is obtained. It reveals that the generalization ability of multi-task learning algorithms is indeed affected as the number of tasks increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2104.00453v3-abstract-full').style.display = 'none'; document.getElementById('2104.00453v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.11590">arXiv:2103.11590</a> <span> [<a href="https://arxiv.org/pdf/2103.11590">pdf</a>, <a href="https://arxiv.org/format/2103.11590">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Delving into Variance Transmission and Normalization: Shift of Average Gradient Makes the Network Collapse </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jidong Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanyi Li</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.11590v1-abstract-short" style="display: inline;"> Normalization operations are essential for state-of-the-art neural networks and enable us to train a network from scratch with a large learning rate (LR). We attempt to explain the real effect of Batch Normalization (BN) from the perspective of variance transmission by investigating the relationship between BN and Weights Normalization (WN). In this work, we demonstrate that the problem of the shi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.11590v1-abstract-full').style.display = 'inline'; document.getElementById('2103.11590v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.11590v1-abstract-full" style="display: none;"> Normalization operations are essential for state-of-the-art neural networks and enable us to train a network from scratch with a large learning rate (LR). We attempt to explain the real effect of Batch Normalization (BN) from the perspective of variance transmission by investigating the relationship between BN and Weights Normalization (WN). In this work, we demonstrate that the problem of the shift of the average gradient will amplify the variance of every convolutional (conv) layer. We propose Parametric Weights Standardization (PWS), a fast and robust to mini-batch size module used for conv filters, to solve the shift of the average gradient. PWS can provide the speed-up of BN. Besides, it has less computation and does not change the output of a conv layer. PWS enables the network to converge fast without normalizing the outputs. This result enhances the persuasiveness of the shift of the average gradient and explains why BN works from the perspective of variance transmission. The code and appendix will be made available on https://github.com/lyxzzz/PWSConv. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.11590v1-abstract-full').style.display = 'none'; document.getElementById('2103.11590v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by AAAI21</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07427">arXiv:2005.07427</a> <span> [<a href="https://arxiv.org/pdf/2005.07427">pdf</a>, <a href="https://arxiv.org/format/2005.07427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Structural Temporal Graph Neural Networks for Anomaly Detection in Dynamic Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+L">Lei Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengzhang Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chen Luo</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiaping Gui</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jingchao Ni</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Ding Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07427v2-abstract-short" style="display: inline;"> Detecting anomalies in dynamic graphs is a vital task, with numerous practical applications in areas such as security, finance, and social media. Previous network embedding based methods have been mostly focusing on learning good node representations, whereas largely ignoring the subgraph structural changes related to the target nodes in dynamic graphs. In this paper, we propose StrGNN, an end-to-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07427v2-abstract-full').style.display = 'inline'; document.getElementById('2005.07427v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07427v2-abstract-full" style="display: none;"> Detecting anomalies in dynamic graphs is a vital task, with numerous practical applications in areas such as security, finance, and social media. Previous network embedding based methods have been mostly focusing on learning good node representations, whereas largely ignoring the subgraph structural changes related to the target nodes in dynamic graphs. In this paper, we propose StrGNN, an end-to-end structural temporal Graph Neural Network model for detecting anomalous edges in dynamic graphs. In particular, we first extract the $h$-hop enclosing subgraph centered on the target edge and propose the node labeling function to identify the role of each node in the subgraph. Then, we leverage graph convolution operation and Sortpooling layer to extract the fixed-size feature from each snapshot/timestamp. Based on the extracted features, we utilize Gated recurrent units (GRUs) to capture the temporal information for anomaly detection. Extensive experiments on six benchmark datasets and a real enterprise security system demonstrate the effectiveness of StrGNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07427v2-abstract-full').style.display = 'none'; document.getElementById('2005.07427v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2004.01143">arXiv:2004.01143</a> <span> [<a href="https://arxiv.org/pdf/2004.01143">pdf</a>, <a href="https://arxiv.org/format/2004.01143">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Randomized Kernel Multi-view Discriminant Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyun Li</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Ping Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2004.01143v1-abstract-short" style="display: inline;"> In many artificial intelligence and computer vision systems, the same object can be observed at distinct viewpoints or by diverse sensors, which raises the challenges for recognizing objects from different, even heterogeneous views. Multi-view discriminant analysis (MvDA) is an effective multi-view subspace learning method, which finds a discriminant common subspace by jointly learning multiple vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.01143v1-abstract-full').style.display = 'inline'; document.getElementById('2004.01143v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2004.01143v1-abstract-full" style="display: none;"> In many artificial intelligence and computer vision systems, the same object can be observed at distinct viewpoints or by diverse sensors, which raises the challenges for recognizing objects from different, even heterogeneous views. Multi-view discriminant analysis (MvDA) is an effective multi-view subspace learning method, which finds a discriminant common subspace by jointly learning multiple view-specific linear projections for object recognition from multiple views, in a non-pairwise way. In this paper, we propose the kernel version of multi-view discriminant analysis, called kernel multi-view discriminant analysis (KMvDA). To overcome the well-known computational bottleneck of kernel methods, we also study the performance of using random Fourier features (RFF) to approximate Gaussian kernels in KMvDA, for large scale learning. Theoretical analysis on stability of this approximation is developed. We also conduct experiments on several popular multi-view datasets to illustrate the effectiveness of our proposed strategy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2004.01143v1-abstract-full').style.display = 'none'; document.getElementById('2004.01143v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2001.06937">arXiv:2001.06937</a> <span> [<a href="https://arxiv.org/pdf/2001.06937">pdf</a>, <a href="https://arxiv.org/format/2001.06937">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Review on Generative Adversarial Networks: Algorithms, Theory, and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yonggang Wen</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2001.06937v1-abstract-short" style="display: inline;"> Generative adversarial networks (GANs) are a hot research topic recently. GANs have been widely studied since 2014, and a large number of algorithms have been proposed. However, there is few comprehensive study explaining the connections among different GANs variants, and how they have evolved. In this paper, we attempt to provide a review on various GANs methods from the perspectives of algorithm… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.06937v1-abstract-full').style.display = 'inline'; document.getElementById('2001.06937v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2001.06937v1-abstract-full" style="display: none;"> Generative adversarial networks (GANs) are a hot research topic recently. GANs have been widely studied since 2014, and a large number of algorithms have been proposed. However, there is few comprehensive study explaining the connections among different GANs variants, and how they have evolved. In this paper, we attempt to provide a review on various GANs methods from the perspectives of algorithms, theory, and applications. Firstly, the motivations, mathematical representations, and structure of most GANs algorithms are introduced in details. Furthermore, GANs have been combined with other machine learning algorithms for specific applications, such as semi-supervised learning, transfer learning, and reinforcement learning. This paper compares the commonalities and differences of these GANs methods. Secondly, theoretical issues related to GANs are investigated. Thirdly, typical applications of GANs in image processing and computer vision, natural language processing, music, speech and audio, medical field, and data science are illustrated. Finally, the future open research problems for GANs are pointed out. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2001.06937v1-abstract-full').style.display = 'none'; document.getElementById('2001.06937v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.00398">arXiv:1912.00398</a> <span> [<a href="https://arxiv.org/pdf/1912.00398">pdf</a>, <a href="https://arxiv.org/format/1912.00398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Human Answer Understanding for Natural Reverse QA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+R">Rujing Yao</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+L">Linlin Hou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lei Yang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Qing Yin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+O">Ou Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.00398v2-abstract-short" style="display: inline;"> This study focuses on a reverse question answering (QA) procedure, in which machines proactively raise questions and humans supply the answers. This procedure exists in many real human-machine interaction applications. However, a crucial problem in human-machine interaction is answer understanding. The existing solutions have relied on mandatory option term selection to avoid automatic answer unde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.00398v2-abstract-full').style.display = 'inline'; document.getElementById('1912.00398v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.00398v2-abstract-full" style="display: none;"> This study focuses on a reverse question answering (QA) procedure, in which machines proactively raise questions and humans supply the answers. This procedure exists in many real human-machine interaction applications. However, a crucial problem in human-machine interaction is answer understanding. The existing solutions have relied on mandatory option term selection to avoid automatic answer understanding. However, these solutions have led to unnatural human-computer interaction and negatively affected user experience. To this end, the current study proposes a novel deep answer understanding network, called AntNet, for reverse QA. The network consists of three new modules, namely, skeleton attention for questions, relevance-aware representation of answers, and multi-hop based fusion. As answer understanding for reverse QA has not been explored, a new data corpus is compiled in this study. Experimental results indicate that our proposed network is significantly better than existing methods and those modified from classical natural language processing deep models. The effectiveness of the three new modules is also verified. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.00398v2-abstract-full').style.display = 'none'; document.getElementById('1912.00398v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.08074">arXiv:1910.08074</a> <span> [<a href="https://arxiv.org/pdf/1910.08074">pdf</a>, <a href="https://arxiv.org/format/1910.08074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Heterogeneous Graph Matching Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shen Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengzhang Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiao Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Ding Li</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jingchao Ni</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Lu-An Tang</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiaping Gui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhichun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.08074v1-abstract-short" style="display: inline;"> Information systems have widely been the target of malware attacks. Traditional signature-based malicious program detection algorithms can only detect known malware and are prone to evasion techniques such as binary obfuscation, while behavior-based approaches highly rely on the malware training samples and incur prohibitively high training cost. To address the limitations of existing techniques,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.08074v1-abstract-full').style.display = 'inline'; document.getElementById('1910.08074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.08074v1-abstract-full" style="display: none;"> Information systems have widely been the target of malware attacks. Traditional signature-based malicious program detection algorithms can only detect known malware and are prone to evasion techniques such as binary obfuscation, while behavior-based approaches highly rely on the malware training samples and incur prohibitively high training cost. To address the limitations of existing techniques, we propose MatchGNet, a heterogeneous Graph Matching Network model to learn the graph representation and similarity metric simultaneously based on the invariant graph modeling of the program's execution behaviors. We conduct a systematic evaluation of our model and show that it is accurate in detecting malicious program behavior and can help detect malware attacks with less false positives. MatchGNet outperforms the state-of-the-art algorithms in malware detection by generating 50% less false positives while keeping zero false negatives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.08074v1-abstract-full').style.display = 'none'; document.getElementById('1910.08074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.11937">arXiv:1909.11937</a> <span> [<a href="https://arxiv.org/pdf/1909.11937">pdf</a>, <a href="https://arxiv.org/format/1909.11937">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-grained Attention Networks for Single Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Huapeng Wu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Z">Zhengxia Zou</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Wen-Jun Zeng</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jieping Ye</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongyi Liu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhihui Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.11937v2-abstract-short" style="display: inline;"> Deep Convolutional Neural Networks (CNN) have drawn great attention in image super-resolution (SR). Recently, visual attention mechanism, which exploits both of the feature importance and contextual cues, has been introduced to image SR and proves to be effective to improve CNN-based SR performance. In this paper, we make a thorough investigation on the attention mechanisms in a SR model and shed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11937v2-abstract-full').style.display = 'inline'; document.getElementById('1909.11937v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.11937v2-abstract-full" style="display: none;"> Deep Convolutional Neural Networks (CNN) have drawn great attention in image super-resolution (SR). Recently, visual attention mechanism, which exploits both of the feature importance and contextual cues, has been introduced to image SR and proves to be effective to improve CNN-based SR performance. In this paper, we make a thorough investigation on the attention mechanisms in a SR model and shed light on how simple and effective improvements on these ideas improve the state-of-the-arts. We further propose a unified approach called "multi-grained attention networks (MGAN)" which fully exploits the advantages of multi-scale and attention mechanisms in SR tasks. In our method, the importance of each neuron is computed according to its surrounding regions in a multi-grained fashion and then is used to adaptively re-scale the feature responses. More importantly, the "channel attention" and "spatial attention" strategies in previous methods can be essentially considered as two special cases of our method. We also introduce multi-scale dense connections to extract the image features at multiple scales and capture the features of different layers through dense skip connections. Ablation studies on benchmark datasets demonstrate the effectiveness of our method. In comparison with other state-of-the-art SR methods, our method shows the superiority in terms of both accuracy and model size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11937v2-abstract-full').style.display = 'none'; document.getElementById('1909.11937v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.04088">arXiv:1904.04088</a> <span> [<a href="https://arxiv.org/pdf/1904.04088">pdf</a>, <a href="https://arxiv.org/format/1904.04088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIP.2015.2495116">10.1109/TIP.2015.2495116 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Large Margin Multi-modal Multi-task Feature Extraction for Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yong Luo</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yonggang Wen</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.04088v1-abstract-short" style="display: inline;"> The features used in many image analysis-based applications are frequently of very high dimension. Feature extraction offers several advantages in high-dimensional cases, and many recent studies have used multi-task feature extraction approaches, which often outperform single-task feature extraction approaches. However, most of these methods are limited in that they only consider data represented… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.04088v1-abstract-full').style.display = 'inline'; document.getElementById('1904.04088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.04088v1-abstract-full" style="display: none;"> The features used in many image analysis-based applications are frequently of very high dimension. Feature extraction offers several advantages in high-dimensional cases, and many recent studies have used multi-task feature extraction approaches, which often outperform single-task feature extraction approaches. However, most of these methods are limited in that they only consider data represented by a single type of feature, even though features usually represent images from multiple modalities. We therefore propose a novel large margin multi-modal multi-task feature extraction (LM3FE) framework for handling multi-modal features for image classification. In particular, LM3FE simultaneously learns the feature extraction matrix for each modality and the modality combination coefficients. In this way, LM3FE not only handles correlated and noisy features, but also utilizes the complementarity of different modalities to further help reduce feature redundancy in each modality. The large margin principle employed also helps to extract strongly predictive features so that they are more suitable for prediction (e.g., classification). An alternating algorithm is developed for problem optimization and each sub-problem can be efficiently solved. Experiments on two challenging real-world image datasets demonstrate the effectiveness and superiority of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.04088v1-abstract-full').style.display = 'none'; document.getElementById('1904.04088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Image Processing (Volume: 25, Issue: 1, Jan. 2016) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.03556">arXiv:1904.03556</a> <span> [<a href="https://arxiv.org/pdf/1904.03556">pdf</a>, <a href="https://arxiv.org/format/1904.03556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Fast Supervised Discrete Hashing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.03556v1-abstract-short" style="display: inline;"> Learning-based hashing algorithms are ``hot topics" because they can greatly increase the scale at which existing methods operate. In this paper, we propose a new learning-based hashing method called ``fast supervised discrete hashing" (FSDH) based on ``supervised discrete hashing" (SDH). Regressing the training examples (or hash code) to the corresponding class labels is widely used in ordinary l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.03556v1-abstract-full').style.display = 'inline'; document.getElementById('1904.03556v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.03556v1-abstract-full" style="display: none;"> Learning-based hashing algorithms are ``hot topics" because they can greatly increase the scale at which existing methods operate. In this paper, we propose a new learning-based hashing method called ``fast supervised discrete hashing" (FSDH) based on ``supervised discrete hashing" (SDH). Regressing the training examples (or hash code) to the corresponding class labels is widely used in ordinary least squares regression. Rather than adopting this method, FSDH uses a very simple yet effective regression of the class labels of training examples to the corresponding hash code to accelerate the algorithm. To the best of our knowledge, this strategy has not previously been used for hashing. Traditional SDH decomposes the optimization into three sub-problems, with the most critical sub-problem - discrete optimization for binary hash codes - solved using iterative discrete cyclic coordinate descent (DCC), which is time-consuming. However, FSDH has a closed-form solution and only requires a single rather than iterative hash code-solving step, which is highly efficient. Furthermore, FSDH is usually faster than SDH for solving the projection matrix for least squares regression, making FSDH generally faster than SDH. For example, our results show that FSDH is about 12-times faster than SDH when the number of hashing bits is 128 on the CIFAR-10 data base, and FSDH is about 151-times faster than FastHash when the number of hashing bits is 64 on the MNIST data-base. Our experimental results show that FSDH is not only fast, but also outperforms other comparative methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.03556v1-abstract-full').style.display = 'none'; document.getElementById('1904.03556v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.03549">arXiv:1904.03549</a> <span> [<a href="https://arxiv.org/pdf/1904.03549">pdf</a>, <a href="https://arxiv.org/format/1904.03549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Supervised Discrete Hashing with Relaxation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jie Gui</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.03549v1-abstract-short" style="display: inline;"> Data-dependent hashing has recently attracted attention due to being able to support efficient retrieval and storage of high-dimensional data such as documents, images, and videos. In this paper, we propose a novel learning-based hashing method called "Supervised Discrete Hashing with Relaxation" (SDHR) based on "Supervised Discrete Hashing" (SDH). SDH uses ordinary least squares regression and tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.03549v1-abstract-full').style.display = 'inline'; document.getElementById('1904.03549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.03549v1-abstract-full" style="display: none;"> Data-dependent hashing has recently attracted attention due to being able to support efficient retrieval and storage of high-dimensional data such as documents, images, and videos. In this paper, we propose a novel learning-based hashing method called "Supervised Discrete Hashing with Relaxation" (SDHR) based on "Supervised Discrete Hashing" (SDH). SDH uses ordinary least squares regression and traditional zero-one matrix encoding of class label information as the regression target (code words), thus fixing the regression target. In SDHR, the regression target is instead optimized. The optimized regression target matrix satisfies a large margin constraint for correct classification of each example. Compared with SDH, which uses the traditional zero-one matrix, SDHR utilizes the learned regression target matrix and, therefore, more accurately measures the classification error of the regression model and is more flexible. As expected, SDHR generally outperforms SDH. Experimental results on two large-scale image datasets (CIFAR-10 and MNIST) and a large-scale and challenging face dataset (FRGC) demonstrate the effectiveness and efficiency of SDHR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.03549v1-abstract-full').style.display = 'none'; document.getElementById('1904.03549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1702.07681">arXiv:1702.07681</a> <span> [<a href="https://arxiv.org/pdf/1702.07681">pdf</a>, <a href="https://arxiv.org/format/1702.07681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> What Aspects of Mobile Ads Do Users Care About? An Empirical Study of Mobile In-app Ad Reviews </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gui%2C+J">Jiaping Gui</a>, <a href="/search/cs?searchtype=author&query=Nagappan%2C+M">Meiyappan Nagappan</a>, <a href="/search/cs?searchtype=author&query=Halfond%2C+W+G+J">William G. J. Halfond</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1702.07681v1-abstract-short" style="display: inline;"> In the mobile app ecosystem, developers receive ad revenue by placing ads in their apps and releasing them for free. While there is evidence that users do not like ads, we do not know what are the aspects of ads that users dislike nor if they dislike certain aspects of ads more than others. Therefore, in this paper, we analyzed the different topics of ad related complaints from users. In order to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.07681v1-abstract-full').style.display = 'inline'; document.getElementById('1702.07681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1702.07681v1-abstract-full" style="display: none;"> In the mobile app ecosystem, developers receive ad revenue by placing ads in their apps and releasing them for free. While there is evidence that users do not like ads, we do not know what are the aspects of ads that users dislike nor if they dislike certain aspects of ads more than others. Therefore, in this paper, we analyzed the different topics of ad related complaints from users. In order to do this, we investigated app reviews that users gave for apps in the app store that were about ads. We manually examined a random sample set of 400 ad reviews to identify ad complaint topics. We found that most ad complaints were about user interface (UI) related topics and three topics were brought up the most often: the frequency with which ads were displayed, the timing of when ads were displayed, and the location of the displayed ads. Our results provide actionable information to software developers regarding the aspects of ads that are most likely to be complained about by users in their reviews. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1702.07681v1-abstract-full').style.display = 'none'; document.getElementById('1702.07681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository