Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 166 results for author: <span class="mathjax">Nie, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Nie%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Nie, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Nie%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Nie, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Nie%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02748">arXiv:2502.02748</a> <span> [<a href="https://arxiv.org/pdf/2502.02748">pdf</a>, <a href="https://arxiv.org/format/2502.02748">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> ReGNet: Reciprocal Space-Aware Long-Range Modeling and Multi-Property Prediction for Crystals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jianan Nie</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">Peiyao Xiao</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+K">Kaiyi Ji</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Peng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02748v1-abstract-short" style="display: inline;"> Predicting properties of crystals from their structures is a fundamental yet challenging task in materials science. Unlike molecules, crystal structures exhibit infinite periodic arrangements of atoms, requiring methods capable of capturing both local and global information effectively. However, most current works fall short of capturing long-range interactions within periodic structures. 
To addre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02748v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02748v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02748v1-abstract-full" style="display: none;"> Predicting properties of crystals from their structures is a fundamental yet challenging task in materials science. Unlike molecules, crystal structures exhibit infinite periodic arrangements of atoms, requiring methods capable of capturing both local and global information effectively. However, most current works fall short of capturing long-range interactions within periodic structures. To address this limitation, we leverage reciprocal space to efficiently encode long-range interactions with learnable filters within Fourier transforms. We introduce Reciprocal Geometry Network (ReGNet), a novel architecture that integrates geometric GNNs and reciprocal blocks to model short-range and long-range interactions, respectively. Additionally, we introduce ReGNet-MT, a multi-task extension that employs mixture of experts (MoE) for multi-property prediction. Experimental results on the JARVIS and Materials Project benchmarks demonstrate that ReGNet achieves significant performance improvements. Moreover, ReGNet-MT attains state-of-the-art results on two bandgap properties due to positive transfer, while maintaining high computational efficiency. These findings highlight the potential of our model as a scalable and accurate solution for crystal property prediction. The code will be released upon paper acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02748v1-abstract-full').style.display = 'none'; document.getElementById('2502.02748v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
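The ReGNet abstract above hinges on one concrete mechanism: learnable filters applied in reciprocal (Fourier) space. A minimal sketch of such a layer, in the spirit of Fourier-neural-operator spectral filtering, is shown below; the class name, shapes, and mode-truncation scheme are assumptions for illustration, not the authors' implementation (their code is unreleased per the abstract).

```python
import torch
import torch.nn as nn

class ReciprocalFilter(nn.Module):
    """Illustrative layer: mix features in reciprocal space with a
    learnable complex filter over the lowest `modes` frequencies."""

    def __init__(self, channels: int, modes: int = 16):
        super().__init__()
        self.modes = modes
        # One complex weight per retained frequency and channel (assumption).
        scale = 1.0 / channels
        self.weight = nn.Parameter(
            scale * torch.randn(channels, modes, dtype=torch.cfloat)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, n_points) features sampled along a lattice axis.
        x_hat = torch.fft.rfft(x, dim=-1)                    # to reciprocal space
        out = torch.zeros_like(x_hat)
        k = min(self.modes, x_hat.shape[-1])
        out[..., :k] = x_hat[..., :k] * self.weight[:, :k]   # learnable filter
        return torch.fft.irfft(out, n=x.shape[-1], dim=-1)   # back to real space

if __name__ == "__main__":
    layer = ReciprocalFilter(channels=8, modes=4)
    print(layer(torch.randn(2, 8, 64)).shape)  # torch.Size([2, 8, 64])
```

Because the filter acts on global frequency components, every output point depends on every input point, which is what makes such layers attractive for long-range interactions in periodic structures.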
2. arXiv:2412.14633 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Progressive Fine-to-Coarse Reconstruction for Accurate Low-Bit Post-Training Quantization in Vision Transformers
Authors: Rui Ding, Liang Yong, Sihuan Zhao, Jing Nie, Lihui Chen, Haijun Liu, Xichuan Zhou
Abstract: Due to its efficiency, Post-Training Quantization (PTQ) has been widely adopted for compressing Vision Transformers (ViTs). However, when quantized into low-bit representations, there is often a significant performance drop compared to their full-precision counterparts. To address this issue, reconstruction methods have been incorporated into the PTQ framework to improve performance in low-bit quantization settings. Nevertheless, existing related methods predefine the reconstruction granularity and seldom explore the progressive relationships between different reconstruction granularities, which leads to sub-optimal quantization results in ViTs. To this end, in this paper, we propose a Progressive Fine-to-Coarse Reconstruction (PFCR) method for accurate PTQ, which significantly improves the performance of low-bit quantized vision transformers. Specifically, we define multi-head self-attention and multi-layer perceptron modules along with their shortcuts as the finest reconstruction units. After reconstructing these two fine-grained units, we combine them to form coarser blocks and reconstruct them at a coarser granularity level. We iteratively perform this combination and reconstruction process, achieving progressive fine-to-coarse reconstruction. Additionally, we introduce a Progressive Optimization Strategy (POS) for PFCR to alleviate the difficulty of training, thereby further enhancing model performance. Experimental results on the ImageNet dataset demonstrate that our proposed method achieves the best Top-1 accuracy among state-of-the-art methods, particularly attaining 75.61% for 3-bit quantized ViT-B in PTQ. Besides, quantization results on the COCO dataset reveal the effectiveness and generalization of our proposed method on other computer vision tasks like object detection and instance segmentation.
Submitted 19 December, 2024; originally announced December 2024.
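The fine-to-coarse schedule PFCR describes (reconstruct the finest units, merge neighbours into coarser blocks, repeat) reduces to a short control loop. The sketch below shows only that scheduling logic under assumed names; `reconstruct` stands in for whatever output-matching calibration step is used and is not the paper's procedure.

```python
from typing import Callable, List, Sequence

def pfcr_schedule(units: Sequence[str],
                  reconstruct: Callable[[List[str]], None]) -> None:
    """Sketch of progressive fine-to-coarse reconstruction: calibrate the
    finest units first, then repeatedly merge neighbours into coarser
    blocks and calibrate those, until one block spans the model."""
    level: List[List[str]] = [[u] for u in units]   # finest granularity
    while True:
        for block in level:
            reconstruct(block)                      # output-matching step
        if len(level) == 1:
            break
        # Merge adjacent blocks pairwise to form the next, coarser level.
        level = [sum(level[i:i + 2], []) for i in range(0, len(level), 2)]

if __name__ == "__main__":
    pfcr_schedule(["mhsa_0", "mlp_0", "mhsa_1", "mlp_1"],
                  lambda block: print("reconstruct", block))
```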
3. arXiv:2412.12501 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Title: Unleashing the Potential of Model Bias for Generalized Category Discovery
Authors: Wenbin An, Haonan Lin, Jiahao Nie, Feng Tian, Wenkai Shi, Yaqiang Wu, Qianying Wang, Ping Chen
Abstract: Generalized Category Discovery is a significant and complex task that aims to identify both known and undefined novel categories from a set of unlabeled data, leveraging another labeled dataset containing only known categories. The primary challenges stem from model bias induced by pre-training on only known categories and the lack of precise supervision for novel ones, leading to category bias towards known categories and category confusion among different novel categories, which hinders models' ability to identify novel categories effectively. To address these challenges, we propose a novel framework named Self-Debiasing Calibration (SDC). Unlike prior methods that regard model bias towards known categories as an obstacle to novel category identification, SDC provides a novel insight into unleashing the potential of the bias to facilitate novel category learning. Specifically, the output of the biased model serves two key purposes. First, it provides an accurate modeling of category bias, which can be utilized to measure the degree of bias and debias the output of the current training model. Second, it offers valuable insights for distinguishing different novel categories by transferring knowledge between similar categories. Based on these insights, SDC dynamically adjusts the output logits of the current training model using the output of the biased model. This approach produces less biased logits to effectively address the issue of category bias towards known categories, and generates more accurate pseudo labels for unlabeled data, thereby mitigating category confusion for novel categories. Experiments on three benchmark datasets show that SDC outperforms SOTA methods, especially in the identification of novel categories. Our code and data are available at https://github.com/Lackel/SDC.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
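SDC's key step, adjusting the current model's logits with the biased model's output to get better pseudo-labels, can be illustrated with a generic logit-adjustment pattern. The exact correction below (subtracting a log-prior estimated from the biased model) is an assumption for illustration, not the paper's formula.

```python
import torch
import torch.nn.functional as F

def debiased_pseudo_labels(cur_logits: torch.Tensor,
                           biased_logits: torch.Tensor,
                           tau: float = 1.0) -> torch.Tensor:
    """Sketch: estimate category bias from the frozen biased model and
    subtract it (as a log-prior) from the current model's logits before
    deriving pseudo-labels for unlabeled data."""
    # Average predicted distribution of the biased model ~ category prior.
    prior = F.softmax(biased_logits, dim=-1).mean(dim=0)       # (C,)
    adjusted = cur_logits - tau * torch.log(prior + 1e-8)      # debias
    return F.softmax(adjusted, dim=-1)                         # soft labels

if __name__ == "__main__":
    cur = torch.randn(4, 10)
    biased = torch.randn(4, 10) + torch.tensor([3.0] + [0.0] * 9)  # skewed
    print(debiased_pseudo_labels(cur, biased).argmax(dim=-1))
```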
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07998">arXiv:2412.07998</a> <span> [<a href="https://arxiv.org/pdf/2412.07998">pdf</a>, <a href="https://arxiv.org/ps/2412.07998">ps</a>, <a href="https://arxiv.org/format/2412.07998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RALI@TREC iKAT 2024: Achieving Personalization via Retrieval Fusion in Conversational Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hui%2C+Y">Yuchen Hui</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+M">Milan Mao</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07998v1-abstract-short" style="display: inline;"> The Recherche Appliquee en Linguistique Informatique (RALI) team participated in the 2024 TREC Interactive Knowledge Assistance (iKAT) Track. In personalized conversational search, effectively capturing a user's complex search intent requires incorporating both contextual information and key elements from the user profile into query reformulation. The user profile often contains many relevant piec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07998v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07998v1-abstract-full" style="display: none;"> The Recherche Appliquee en Linguistique Informatique (RALI) team participated in the 2024 TREC Interactive Knowledge Assistance (iKAT) Track. In personalized conversational search, effectively capturing a user's complex search intent requires incorporating both contextual information and key elements from the user profile into query reformulation. The user profile often contains many relevant pieces, and each could potentially complement the user's information needs. It is difficult to disregard any of them, whereas introducing an excessive number of these pieces risks drifting from the original query and hinders search performance. This is a challenge we denote as over-personalization. To address this, we propose different strategies by fusing ranking lists generated from the queries with different levels of personalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07998v1-abstract-full').style.display = 'none'; document.getElementById('2412.07998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work presented at NIST Text Retrieval Conference 2024. https://www.nist.gov/news-events/events/2024/11/trec2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07168">arXiv:2412.07168</a> <span> [<a href="https://arxiv.org/pdf/2412.07168">pdf</a>, <a href="https://arxiv.org/format/2412.07168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3A-YOLO: New Real-Time Object Detectors with Triple Discriminative Awareness and Coordinated Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xuecheng Wu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Junxiao Xue</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+L">Liangyu Fu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiayu Nie</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Danlei Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xinyi Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07168v1-abstract-short" style="display: inline;"> Recent research on real-time object detectors (e.g., YOLO series) has demonstrated the effectiveness of attention mechanisms for elevating model performance. Nevertheless, existing methods neglect to unifiedly deploy hierarchical attention mechanisms to construct a more discriminative YOLO head which is enriched with more useful intermediate features. To tackle this gap, this work aims to leverage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07168v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07168v1-abstract-full" style="display: none;"> Recent research on real-time object detectors (e.g., YOLO series) has demonstrated the effectiveness of attention mechanisms for elevating model performance. Nevertheless, existing methods neglect to unifiedly deploy hierarchical attention mechanisms to construct a more discriminative YOLO head which is enriched with more useful intermediate features. To tackle this gap, this work aims to leverage multiple attention mechanisms to hierarchically enhance the triple discriminative awareness of the YOLO detection head and complementarily learn the coordinated intermediate representations, resulting in a new series detectors denoted 3A-YOLO. Specifically, we first propose a new head denoted TDA-YOLO Module, which unifiedly enhance the representations learning of scale-awareness, spatial-awareness, and task-awareness. Secondly, we steer the intermediate features to coordinately learn the inter-channel relationships and precise positional information. Finally, we perform neck network improvements followed by introducing various tricks to boost the adaptability of 3A-YOLO. Extensive experiments across COCO and VOC benchmarks indicate the effectiveness of our detectors. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07168v1-abstract-full').style.display = 'none'; document.getElementById('2412.07168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05897">arXiv:2412.05897</a> <span> [<a href="https://arxiv.org/pdf/2412.05897">pdf</a>, <a href="https://arxiv.org/format/2412.05897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Detecting Discrepancies Between AI-Generated and Natural Images Using Uncertainty </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jun Nie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yonggang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+Y">Yiu-ming Cheung</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xinmei Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05897v1-abstract-short" style="display: inline;"> In this work, we propose a novel approach for detecting AI-generated images by leveraging predictive uncertainty to mitigate misuse and associated risks. The motivation arises from the fundamental assumption regarding the distributional discrepancy between natural and AI-generated images. The feasibility of distinguishing natural images from AI-generated ones is grounded in the distribution discre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05897v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05897v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05897v1-abstract-full" style="display: none;"> In this work, we propose a novel approach for detecting AI-generated images by leveraging predictive uncertainty to mitigate misuse and associated risks. The motivation arises from the fundamental assumption regarding the distributional discrepancy between natural and AI-generated images. The feasibility of distinguishing natural images from AI-generated ones is grounded in the distribution discrepancy between them. Predictive uncertainty offers an effective approach for capturing distribution shifts, thereby providing insights into detecting AI-generated images. Namely, as the distribution shift between training and testing data increases, model performance typically degrades, often accompanied by increased predictive uncertainty. Therefore, we propose to employ predictive uncertainty to reflect the discrepancies between AI-generated and natural images. 
In this context, the challenge lies in ensuring that the model has been trained over sufficient natural images to avoid the risk of determining the distribution of natural images as that of generated images. We propose to leverage large-scale pre-trained models to calculate the uncertainty as the score for detecting AI-generated images. This leads to a simple yet effective method for detecting AI-generated images using large-scale vision models: images that induce high uncertainty are identified as AI-generated. Comprehensive experiments across multiple benchmarks demonstrate the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05897v1-abstract-full').style.display = 'none'; document.getElementById('2412.05897v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02575">arXiv:2412.02575</a> <span> [<a href="https://arxiv.org/pdf/2412.02575">pdf</a>, <a href="https://arxiv.org/format/2412.02575">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Copy-Move Forgery Detection and Question Answering for Remote Sensing Image </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ze Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+E">Enyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Ziyi Wan</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jie Nie</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xinyue Liang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02575v1-abstract-short" style="display: inline;"> This paper introduces the task of Remote Sensing Copy-Move Question Answering (RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA), RSCMQA focuses on interpreting complex tampering scenarios and inferring relationships between objects. Based on the practical needs of national defense security and land resource monitoring, we have developed an accurate and comprehensive glo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02575v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02575v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02575v1-abstract-full" style="display: none;"> This paper introduces the task of Remote Sensing Copy-Move Question Answering (RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA), RSCMQA focuses on interpreting complex tampering scenarios and inferring relationships between objects. 
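This abstract states its decision rule directly: images that induce high predictive uncertainty under a large pre-trained vision model are flagged as AI-generated. A toy version using Monte-Carlo dropout as the uncertainty estimator (the estimator choice and threshold are assumptions here, not the paper's) could look like:

```python
import torch
import torch.nn as nn

@torch.no_grad()
def predictive_entropy(model: nn.Module, x: torch.Tensor,
                       n_samples: int = 8) -> torch.Tensor:
    """Monte-Carlo dropout proxy for predictive uncertainty: keep dropout
    active at inference, average softmax outputs, return entropy."""
    model.train()  # enables dropout; acceptable for this toy sketch
    probs = torch.stack([torch.softmax(model(x), dim=-1)
                         for _ in range(n_samples)]).mean(dim=0)
    return -(probs * torch.log(probs + 1e-8)).sum(dim=-1)

def flag_ai_generated(model: nn.Module, x: torch.Tensor,
                      threshold: float) -> torch.Tensor:
    """Images whose uncertainty exceeds the threshold are flagged."""
    return predictive_entropy(model, x) > threshold

if __name__ == "__main__":
    # Tiny stand-in classifier; a large pre-trained model would be used in practice.
    toy = nn.Sequential(nn.Flatten(), nn.Dropout(0.5), nn.Linear(3 * 8 * 8, 5))
    imgs = torch.randn(4, 3, 8, 8)
    print(flag_ai_generated(toy, imgs, threshold=1.5))
```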
7. arXiv:2412.02575 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
Title: Copy-Move Forgery Detection and Question Answering for Remote Sensing Image
Authors: Ze Zhang, Enyuan Zhao, Ziyi Wan, Jie Nie, Xinyue Liang, Lei Huang
Abstract: This paper introduces the task of Remote Sensing Copy-Move Question Answering (RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA), RSCMQA focuses on interpreting complex tampering scenarios and inferring relationships between objects. Based on the practical needs of national defense security and land resource monitoring, we have developed an accurate and comprehensive global dataset for remote sensing image copy-move question answering, named RS-CMQA-2.1M. These images were collected from 29 different regions across 14 countries. Additionally, we have refined a balanced dataset, RS-CMQA-B, to address the long-standing issue of long-tail data in the remote sensing field. Furthermore, we propose a region-discriminative guided multimodal CMQA model, which enhances the accuracy of answering questions about tampered images by leveraging prompt about the differences and connections between the source and tampered domains. Extensive experiments demonstrate that our method provides a stronger benchmark for RS-CMQA compared to general VQA and RSVQA models. Our dataset and code are available at https://github.com/shenyedepisa/RSCMQA.
Submitted 3 December, 2024; originally announced December 2024.
Comments: 7 figs, 7 tables

8. arXiv:2411.09852 [pdf, other]
Subjects: cs.IR (Information Retrieval); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: InterFormer: Towards Effective Heterogeneous Interaction Learning for Click-Through Rate Prediction
Authors: Zhichen Zeng, Xiaolong Liu, Mengyue Hang, Xiaoyi Liu, Qinghai Zhou, Chaofei Yang, Yiqun Liu, Yichen Ruan, Laming Chen, Yuxin Chen, Yujia Hao, Jiaqi Xu, Jade Nie, Xi Liu, Buyun Zhang, Wei Wen, Siyang Yuan, Kai Wang, Wen-Yen Chen, Yiping Han, Huayu Li, Chunzhi Yang, Bo Long, Philip S. Yu, Hanghang Tong, et al. (1 additional author not shown)
Abstract: Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. However, most of the existing methods suffer from two fundamental limitations, including (1) insufficient inter-mode interaction due to the unidirectional information flow between modes, and (2) aggressive information aggregation caused by early summarization, resulting in excessive information loss. To address the above limitations, we propose a novel module named InterFormer to learn heterogeneous information interaction in an interleaving style. To achieve better interaction learning, InterFormer enables bidirectional information flow for mutually beneficial learning across different modes. To avoid aggressive information aggregation, we retain complete information in each data mode and use a separate bridging arch for effective information selection and summarization. Our proposed InterFormer achieves state-of-the-art performance on three public datasets and a large-scale industrial dataset.
Submitted 7 January, 2025; v1 submitted 14 November, 2024; originally announced November 2024.
Comments: 10 pages, 6 figures
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21017">arXiv:2410.21017</a> <span> [<a href="https://arxiv.org/pdf/2410.21017">pdf</a>, <a href="https://arxiv.org/format/2410.21017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Edge Perception: Intelligent Wireless Sensing at Network Edge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xiaowen Cao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guangxu Zhu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiali Nie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jie Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21017v1-abstract-short" style="display: inline;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21017v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21017v1-abstract-full" style="display: none;"> Future sixth-generation (6G) networks are envisioned to support intelligent applications across various vertical scenarios, which have stringent requirements on high-precision sensing as well as ultra-low-latency data processing and decision making. Towards this end, a new paradigm of edge perception networks emerges, which integrates wireless sensing, communication, computation, and artificial intelligence (AI) capabilities at network edge for intelligent sensing and data processing. This article provides a timely overview on this emerging topic. We commence by discussing wireless edge perception, including physical layer transceiver design, network-wise cooperation, and application-specific data analytics, for which the prospects and challenges are emphasized. Next, we discuss the interplay between edge AI and wireless sensing in edge perception, and present various key techniques for two paradigms, namely edge AI empowered sensing and task-oriented sensing for edge AI, respectively. Finally, we emphasize interesting research directions on edge perception to motivate future works. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21017v1-abstract-full').style.display = 'none'; document.getElementById('2410.21017v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15576">arXiv:2410.15576</a> <span> [<a href="https://arxiv.org/pdf/2410.15576">pdf</a>, <a href="https://arxiv.org/format/2410.15576">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Conversational Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+K">Kelong Mao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ziliang Zhao</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+H">Hongjin Qian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haonan Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yiruo Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoxi Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yutao Zhu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhicheng Dou</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15576v1-abstract-short" style="display: inline;"> As a cornerstone of modern information access, search engines have become indispensable in everyday life. With the rapid advancements in AI and natural language processing (NLP) technologies, particularly large language models (LLMs), search engines have evolved to support more intuitive and intelligent interactions between users and systems. Conversational search, an emerging paradigm for next-ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15576v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15576v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15576v1-abstract-full" style="display: none;"> As a cornerstone of modern information access, search engines have become indispensable in everyday life. With the rapid advancements in AI and natural language processing (NLP) technologies, particularly large language models (LLMs), search engines have evolved to support more intuitive and intelligent interactions between users and systems. Conversational search, an emerging paradigm for next-generation search engines, leverages natural language dialogue to facilitate complex and precise information retrieval, thus attracting significant attention. 
Unlike traditional keyword-based search engines, conversational search systems enhance user experience by supporting intricate queries, maintaining context over multi-turn interactions, and providing robust information integration and processing capabilities. Key components such as query reformulation, search clarification, conversational retrieval, and response generation work in unison to enable these sophisticated interactions. In this survey, we explore the recent advancements and potential future directions in conversational search, examining the critical modules that constitute a conversational search system. We highlight the integration of LLMs in enhancing these systems and discuss the challenges and opportunities that lie ahead in this dynamic field. Additionally, we provide insights into real-world applications and robust evaluations of current conversational search systems, aiming to guide future research and development in conversational search.
Submitted 20 October, 2024; originally announced October 2024.
Comments: 35 pages, 8 figures, continue to update

11. arXiv:2410.01176 [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: Generative Diffusion-based Contract Design for Efficient AI Twins Migration in Vehicular Embodied AI Networks
Authors: Yue Zhong, Jiawen Kang, Jinbo Wen, Dongdong Ye, Jiangtian Nie, Dusit Niyato, Xiaozheng Gao, Shengli Xie
Abstract: Embodied AI is a rapidly advancing field that bridges the gap between cyberspace and physical space, enabling a wide range of applications. This evolution has led to the development of the Vehicular Embodied AI NETwork (VEANET), where advanced AI capabilities are integrated into vehicular systems to enhance autonomous operations and decision-making. Embodied agents, such as Autonomous Vehicles (AVs), are autonomous entities that can perceive their environment and take actions to achieve specific goals, actively interacting with the physical world. Embodied twins are digital models of these embodied agents, with various embodied AI twins for intelligent applications in cyberspace. In VEANET, embodied AI twins act as in-vehicle AI assistants to perform diverse tasks supporting autonomous driving using generative AI models. Due to limited computational resources of AVs, these AVs often offload computationally intensive tasks, such as constructing and updating embodied AI twins, to nearby RSUs. However, since the rapid mobility of AVs and the limited provision coverage of a single RSU, embodied AI twins require dynamic migrations from current RSU to other RSUs in real-time, resulting in the challenge of selecting suitable RSUs for efficient embodied AI twins migrations. Given information asymmetry, AVs cannot know the detailed information of RSUs. To this end, in this paper, we construct a multi-dimensional contract theoretical model between AVs and alternative RSUs. Considering that AVs may exhibit irrational behavior, we utilize prospect theory instead of expected utility theory to model the actual utilities of AVs. Finally, we employ a generative diffusion model-based algorithm to identify the optimal contract designs. Compared with traditional deep reinforcement learning algorithms, numerical results demonstrate the effectiveness of the proposed scheme.
Submitted 1 October, 2024; originally announced October 2024.
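The contract model above replaces expected utility with prospect theory to capture irrational AV behavior. The standard Kahneman-Tversky value function (concave for gains, convex and steeper for losses) is the usual building block for this; the parameter values below are the classic 1992 estimates and are assumed here purely for illustration, not taken from the paper.

```python
def prospect_value(x: float, alpha: float = 0.88,
                   beta: float = 0.88, lam: float = 2.25) -> float:
    """Kahneman-Tversky value function relative to a reference point 0:
    gains are valued as x**alpha, losses as -lam * (-x)**beta, so losses
    loom larger than equal-sized gains (lam > 1)."""
    return x ** alpha if x >= 0 else -lam * (-x) ** beta

if __name__ == "__main__":
    for outcome in (10.0, -10.0):
        print(outcome, "->", round(prospect_value(outcome), 3))
    # 10.0 -> 7.586 and -10.0 -> -17.068: the loss hurts more than the
    # equal gain helps, the kind of risk attitude the paper accounts for.
```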
arXiv:2410.00057 [cs.LG] https://arxiv.org/abs/2410.00057
STTM: A New Approach Based Spatial-Temporal Transformer And Memory Network For Real-time Pressure Signal In On-demand Food Delivery
Authors: Jiang Wang, Haibin Wei, Xiaowei Xu, Jiacheng Shi, Jian Nie, Longzhi Du, Taixu Jiang
Abstract: On-demand Food Delivery (OFD) services have become very common around the world. For example, on the Ele.me platform, users place more than 15 million food orders every day. Predicting the Real-time Pressure Signal (RPS) is crucial for OFD services, as it is primarily used to measure the current load on the logistics system. When RPS rises, pressure increases and the platform must quickly take measures to prevent the logistics system from being overloaded. Usually, the average delivery time of all orders within a business district is used to represent RPS. Existing research on OFD services focuses primarily on predicting the delivery time of orders, while relatively little attention has been given to RPS. Previous work directly applies general models such as DeepFM, RNN, and GNN for prediction, but fails to adequately exploit the unique temporal and spatial characteristics of OFD services, and suffers from insufficient sensitivity during sudden severe weather or peak periods. To address these problems, this paper proposes a new method based on a Spatio-Temporal Transformer and Memory Network (STTM). Specifically, we use a novel Spatio-Temporal Transformer structure to learn logistics features across temporal and spatial dimensions and to encode the historical information of a business district and its neighbors, thereby learning both temporal and spatial information. Additionally, a Memory Network is employed to increase sensitivity to abnormal events. Experimental results on a real-world dataset show that STTM significantly outperforms previous methods in both offline experiments and an online A/B test, demonstrating the effectiveness of the method.
Submitted 29 September, 2024; originally announced October 2024.
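The abstract does not spell out how the memory component works. One common construction, sketched below under that assumption, is attention over a small bank of stored prototype keys whose readout is concatenated to the backbone features; all names and shapes here are illustrative, not the paper's architecture.

```python
import numpy as np

def memory_readout(query, mem_keys, mem_vals):
    """Attention over a fixed memory bank: one generic way a memory network
    can amplify rare patterns (e.g., storms, peak hours). Assumed shapes:
    query (d,), mem_keys (m, d), mem_vals (m, d)."""
    scores = mem_keys @ query / np.sqrt(query.size)   # (m,)
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                          # softmax attention
    return weights @ mem_vals                         # (d,) weighted recall

rng = np.random.default_rng(0)
h = rng.normal(size=16)                # backbone feature of one district
keys = rng.normal(size=(8, 16))        # 8 stored abnormal-event prototypes
vals = rng.normal(size=(8, 16))
augmented = np.concatenate([h, memory_readout(h, keys, vals)])
print(augmented.shape)                 # (32,) -> fed to the prediction head
```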
arXiv:2409.04050 [eess.IV, cs.CV] https://arxiv.org/abs/2409.04050
EigenSR: Eigenimage-Bridged Pre-Trained RGB Learners for Single Hyperspectral Image Super-Resolution
Authors: Xi Su, Xiangfei Shen, Mingyang Wan, Jing Nie, Lihui Chen, Haijun Liu, Xichuan Zhou
Abstract: Single hyperspectral image super-resolution (single-HSI-SR) aims to improve the resolution of a single input low-resolution HSI. Due to the bottleneck of data scarcity, the development of single-HSI-SR lags far behind that of RGB natural images. In recent years, research on RGB SR has shown that models pre-trained on large-scale benchmark datasets can greatly improve performance on unseen data, which may serve as a remedy for HSI. But how can we transfer a pre-trained RGB model to HSI to overcome the data-scarcity bottleneck? Because of the significant difference in channel count between the pre-trained RGB model and the HSI, the model cannot attend to correlations along the spectral dimension, limiting its usefulness on HSI. Inspired by HSI spatial-spectral decoupling, we propose a new framework that first fine-tunes the pre-trained model on the spatial components (known as eigenimages) and then applies it to unseen HSI with an iterative spectral regularization (ISR) that maintains spectral correlation. The advantages of our method are: 1) we effectively inject the spatial texture processing capabilities of the pre-trained RGB model into HSI while keeping spectral fidelity, 2) learning in the spectral-decorrelated domain improves generalizability to spectral-agnostic data, and 3) inference in the eigenimage domain naturally exploits the spectral low-rank property of HSI, thereby reducing complexity. This work bridges the gap between pre-trained RGB models and HSI via eigenimages, addressing the issue of limited HSI training data, hence the name EigenSR. Extensive experiments show that EigenSR outperforms state-of-the-art (SOTA) methods in both spatial and spectral metrics.
Submitted 30 December, 2024; v1 submitted 6 September, 2024; originally announced September 2024.
Comments: AAAI 2025 conference paper
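Eigenimages are not defined in the abstract; the usual construction, and a reasonable reading of the spatial-spectral decoupling described, is an SVD/PCA along the spectral axis that splits the cube into spatial components and a spectral basis. The sketch below shows that decomposition, not the paper's exact pipeline.

```python
import numpy as np

def to_eigenimages(hsi, k):
    """Decouple an HSI cube (H, W, C) into k spatial eigenimages plus a
    spectral basis via SVD along the spectral axis (a PCA-style reading of
    spatial-spectral decoupling)."""
    H, W, C = hsi.shape
    flat = hsi.reshape(-1, C)                          # pixels x channels
    mean = flat.mean(axis=0)
    U, S, Vt = np.linalg.svd(flat - mean, full_matrices=False)
    eigenimages = (U[:, :k] * S[:k]).reshape(H, W, k)  # spatial components
    return eigenimages, Vt[:k], mean                   # basis reconstructs HSI

hsi = np.random.rand(32, 32, 100)                      # toy 100-band cube
eig, basis, mean = to_eigenimages(hsi, k=8)
recon = eig.reshape(-1, 8) @ basis + mean              # low-rank reconstruction
print(np.abs(recon.reshape(hsi.shape) - hsi).mean())
```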
arXiv:2408.16031 [cs.LG, cs.AI] https://arxiv.org/abs/2408.16031
EMP: Enhance Memory in Data Pruning
Authors: Jinying Xiao, Ping Li, Jie Nie, Zhe Tang
Abstract: Recently, large language and vision models have shown strong performance, but due to high pre-training and fine-tuning costs, research has shifted towards faster training via dataset pruning. Previous methods used sample loss as an evaluation criterion, aiming to select the most "difficult" samples for training. However, as the pruning rate increases, the number of times each sample is trained becomes more evenly distributed, which causes many critical or general samples not to be effectively fitted. We refer to this as Low-Frequency Learning (LFL). In other words, LFL prevents the model from remembering most samples. In our work, we decompose the scoring function of LFL, provide a theoretical explanation for its inefficiency, and propose adding a memory term to the scoring function to enhance the model's memory capability, along with an approximation of this term. Similarly, we explore memory in Self-Supervised Learning (SSL), marking the first discussion of SSL memory. Using contrastive learning, we derive the memory term both theoretically and experimentally. Finally, we propose Enhance Memory Pruning (EMP), which addresses insufficient memory under high pruning rates by enhancing the model's memory of data, thereby improving performance. We evaluated EMP on tasks such as image classification, natural language understanding, and model pre-training. The results show that EMP improves model performance under extreme pruning rates. For example, in the CIFAR100-ResNet50 pre-training task, with 70% pruning, EMP outperforms current methods by 2.2%.
Submitted 28 August, 2024; originally announced August 2024.
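The exact scoring function is not given in the abstract. The sketch below only illustrates the general shape of loss-based pruning with an added memory term; the weighting, the memory proxy, and all names are hypothetical.

```python
import numpy as np

# Hypothetical loss-plus-memory scoring for data pruning. 'memory' stands
# in for any per-sample retention proxy (e.g., a moving average of past
# correctness); neither the proxy nor the weighting is the paper's choice.
def memory_aware_scores(losses, memory, lam=0.5):
    return losses + lam * (1.0 - memory)  # favor hard AND poorly-retained samples

def prune(scores, keep_ratio):
    k = int(len(scores) * keep_ratio)
    return np.argsort(scores)[-k:]        # indices of retained samples

losses = np.random.rand(1000)
memory = np.random.rand(1000)
kept = prune(memory_aware_scores(losses, memory), keep_ratio=0.3)  # 70% pruning
print(len(kept))
```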
arXiv:2408.11878 [cs.CL, cs.CE, q-fin.CP] https://arxiv.org/abs/2408.11878
Open-FinLLMs: Open Multimodal Large Language Models for Financial Applications
Authors: Qianqian Xie, Dong Li, Mengxi Xiao, Zihao Jiang, Ruoyu Xiang, Xiao Zhang, Zhengyu Chen, Yueru He, Weiguang Han, Yuzhe Yang, Shunian Chen, Yifei Zhang, Lihang Shen, Daniel Kim, Zhiwei Liu, Zheheng Luo, Yangyang Yu, Yupeng Cao, Zhiyang Deng, Zhiyuan Yao, Haohang Li, Duanyu Feng, Yongfu Dai, VijayaSai Somasundaram, Peng Lu, et al. (14 additional authors not shown)
Abstract: Large language models (LLMs) have advanced financial applications, yet they often lack sufficient financial knowledge and struggle with tasks involving multi-modal inputs like tables and time-series data. To address these limitations, we introduce Open-FinLLMs, a series of financial LLMs. We begin with FinLLaMA, pre-trained on a 52-billion-token financial corpus incorporating text, tables, and time-series data to embed comprehensive financial knowledge. FinLLaMA is then instruction fine-tuned with 573K financial instructions, resulting in FinLLaMA-instruct, which enhances task performance. Finally, we present FinLLaVA, a multimodal LLM trained with 1.43M image-text instructions to handle complex financial data types. Extensive evaluations demonstrate FinLLaMA's superior performance over LLaMA3-8B, LLaMA3.1-8B, and BloombergGPT in both zero-shot and few-shot settings across 19 and 4 datasets, respectively. FinLLaMA-instruct outperforms GPT-4 and other financial LLMs on 15 datasets. FinLLaVA excels in understanding tables and charts across 4 multimodal tasks. Additionally, FinLLaMA achieves impressive Sharpe ratios in trading simulations, highlighting its robust financial application capabilities. We will continually maintain and improve our models and benchmarks to support ongoing innovation in academia and industry.
Submitted 20 August, 2024; originally announced August 2024.
Comments: 33 pages, 13 figures
arXiv:2408.02263 [cs.CV] https://arxiv.org/abs/2408.02263
VoxelTrack: Exploring Voxel Representation for 3D Point Cloud Object Tracking
Authors: Yuxuan Lu, Jiahao Nie, Zhiwei He, Hongjie Gu, Xudong Lv
Abstract: Current LiDAR point cloud-based 3D single object tracking (SOT) methods typically rely on point-based representation networks. Despite demonstrated success, such networks suffer from fundamental problems: 1) they contain pooling operations to cope with inherently disordered point clouds, hindering the capture of 3D spatial information that is useful for tracking, a regression task; 2) the adopted set-abstraction operation hardly handles density-inconsistent point clouds, also preventing 3D spatial information from being modeled. To solve these problems, we introduce a novel tracking framework, termed VoxelTrack. By voxelizing inherently disordered point clouds into 3D voxels and extracting their features via sparse convolution blocks, VoxelTrack effectively models precise and robust 3D spatial information, thereby guiding accurate position prediction for tracked objects. Moreover, VoxelTrack incorporates a dual-stream encoder with a cross-iterative feature fusion module to further explore fine-grained 3D spatial information for tracking. Benefiting from the accurately modeled 3D spatial information, VoxelTrack simplifies the tracking pipeline to a single regression loss. Extensive experiments are conducted on three widely adopted datasets: KITTI, NuScenes and Waymo Open Dataset. The experimental results confirm that VoxelTrack achieves state-of-the-art performance (88.3%, 71.4% and 63.6% mean precision on the three datasets, respectively) and outperforms existing trackers at a real-time speed of 36 FPS on a single TITAN RTX GPU. The source code and model will be released.
Submitted 5 August, 2024; originally announced August 2024.
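Voxelization itself is standard: quantize an unordered point set into a fixed index grid so that (sparse) convolutions can preserve 3D spatial layout, which is the property the pooling-based point networks above lose. A minimal version follows; the grid size and voxel pitch are arbitrary choices for illustration.

```python
import numpy as np

def voxelize(points, voxel_size=0.1, grid=(64, 64, 32)):
    """Quantize an unordered (N, 3) point set into occupancy over a fixed
    3D grid; unlike pooling over raw points, the voxel indices preserve
    spatial layout for convolutional feature extraction."""
    idx = np.floor(points / voxel_size).astype(int)
    idx = np.clip(idx, 0, np.array(grid) - 1)
    occ = np.zeros(grid, dtype=np.float32)
    occ[idx[:, 0], idx[:, 1], idx[:, 2]] = 1.0   # mark occupied voxels
    return occ

pts = np.random.rand(2048, 3) * [6.4, 6.4, 3.2]  # toy LiDAR crop (meters)
print(voxelize(pts).sum(), "occupied voxels")
```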
arXiv:2407.20189 [cs.IR, cs.CL] https://arxiv.org/abs/2407.20189
Aligning Query Representation with Rewritten Query and Relevance Judgments in Conversational Search
Authors: Fengran Mo, Chen Qu, Kelong Mao, Yihong Wu, Zhan Su, Kaiyu Huang, Jian-Yun Nie
Abstract: Conversational search supports multi-turn user-system interactions to solve complex information needs. Different from traditional single-turn ad-hoc search, conversational search faces the more challenging problem of context-dependent query understanding with lengthy, long-tail conversational history. While conversational query rewriting methods leverage explicit rewritten queries to train a rewriting model that transforms the context-dependent query into a standalone search query, this is usually done without considering the quality of the search results. Conversational dense retrieval methods use fine-tuning to improve a pre-trained ad-hoc query encoder, but they are limited by the conversational search data available for training. In this paper, we leverage both rewritten queries and relevance judgments in the conversational search data to train a better query representation model. The key idea is to align the query representation with those of rewritten queries and relevant documents. The proposed model, the Query Representation Alignment Conversational Dense Retriever (QRACDR), is tested on eight datasets covering various settings in conversational and ad-hoc search. The results demonstrate the strong performance of QRACDR compared with state-of-the-art methods and confirm the effectiveness of representation alignment.
Submitted 29 July, 2024; originally announced July 2024.
Comments: Accepted by CIKM 2024
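The abstract states only that session-query embeddings are aligned with rewritten-query and relevant-document embeddings. A minimal way to write such an alignment objective is the mean-squared pull shown below; the distance and the 50/50 weighting are assumptions, not the paper's loss.

```python
import numpy as np

def alignment_loss(q_session, q_rewrite, d_relevant, w=0.5):
    """Pull the session-level query embedding toward (i) the embedding of
    its rewrite and (ii) embeddings of judged-relevant documents.
    Squared L2 and the even weighting are illustrative choices."""
    to_rewrite = np.sum((q_session - q_rewrite) ** 2)
    to_docs = np.mean(np.sum((q_session - d_relevant) ** 2, axis=1))
    return w * to_rewrite + (1 - w) * to_docs

q = np.random.rand(128)
r = np.random.rand(128)
docs = np.random.rand(4, 128)          # four relevant documents
print(alignment_loss(q, r, docs))
```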
arXiv:2407.18424 [cs.SD, cs.LG, eess.AS] https://arxiv.org/abs/2407.18424
Model-driven Heart Rate Estimation and Heart Murmur Detection based on Phonocardiogram
Authors: Jingping Nie, Ran Liu, Behrooz Mahasseni, Erdrin Azemi, Vikramjit Mitra
Abstract: Acoustic signals are crucial for health monitoring, particularly heart sounds, which provide essential data such as heart rate and reveal cardiac anomalies such as murmurs. This study utilizes a publicly available phonocardiogram (PCG) dataset to estimate heart rate using model-driven methods and extends the best-performing model to a multi-task learning (MTL) framework for simultaneous heart rate estimation and murmur detection. Heart rate estimates are derived using a sliding-window technique on heart sound snippets, analyzed with a combination of acoustic features (Mel spectrogram, cepstral coefficients, power spectral density, and root mean square energy). Our findings indicate that a 2D convolutional neural network (2dCNN) is most effective for heart rate estimation, achieving a mean absolute error (MAE) of 1.312 bpm. We systematically investigate the impact of different feature combinations and find that utilizing all four features yields the best results. The MTL model (2dCNN-MTL) achieves over 95% accuracy in murmur detection, surpassing existing models, while maintaining an MAE of 1.636 bpm in heart rate estimation, satisfying the requirements stated by the Association for the Advancement of Medical Instrumentation (AAMI).
Submitted 25 July, 2024; originally announced July 2024.
Comments: 6 pages, 10 figures
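The sliding-window setup is described only at a high level. Below is a minimal sketch of windowing a PCG signal and computing one of the four named features (RMS energy) per window; the sampling rate, window and hop lengths are arbitrary choices for illustration.

```python
import numpy as np

def sliding_windows(signal, sr, win_s=5.0, hop_s=1.0):
    """Split a 1D PCG signal into overlapping snippets (win_s seconds long,
    advancing by hop_s), the unit on which per-window features are computed."""
    win, hop = int(win_s * sr), int(hop_s * sr)
    return [signal[i:i + win] for i in range(0, len(signal) - win + 1, hop)]

def rms_energy(frame):
    """Root-mean-square energy, one of the four acoustic features listed."""
    return float(np.sqrt(np.mean(frame ** 2)))

sr = 4000                                   # assumed PCG sampling rate (Hz)
pcg = np.random.randn(30 * sr)              # toy 30 s recording
feats = [rms_energy(w) for w in sliding_windows(pcg, sr)]
print(len(feats), "windows ->", feats[:3])
```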
arXiv:2407.16192 [cs.IR, cs.CL] https://arxiv.org/abs/2407.16192
How to Leverage Personal Textual Knowledge for Personalized Conversational Information Retrieval
Authors: Fengran Mo, Longxiang Zhao, Kaiyu Huang, Yue Dong, Degen Huang, Jian-Yun Nie
Abstract: Personalized conversational information retrieval (CIR) combines conversational and personalizable elements to satisfy users' complex information needs through multi-turn interaction based on their backgrounds. The key promise is that a personal textual knowledge base (PTKB) can improve CIR effectiveness because retrieval results can be made more relevant to the user's background. However, the PTKB is noisy: not every piece of knowledge it contains is relevant to the specific query at hand. In this paper, we explore and test several ways to select knowledge from the PTKB and use it for query reformulation with a large language model (LLM). The experimental results show that the PTKB does not always improve search results when used alone, but an LLM can help generate a more appropriate personalized query when high-quality guidance is provided.
Submitted 23 July, 2024; originally announced July 2024.
Comments: Accepted to CIKM 2024
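One straightforward realization of LLM-based knowledge selection is to ask the model to filter PTKB entries against the current question before reformulating. The template below is a hypothetical sketch in that spirit, not the paper's prompt; `call_llm` stands in for any chat-completion client.

```python
# Hypothetical prompt template for PTKB knowledge selection followed by
# query reformulation; 'call_llm' is a placeholder for any LLM client.
PROMPT = """Conversation so far:
{history}

Current user question: {question}

Candidate personal facts:
{facts}

Select only the facts relevant to answering the current question, then
rewrite the question as a standalone, personalized search query."""

def personalize(history, question, ptkb, call_llm):
    facts = "\n".join(f"{i}. {k}" for i, k in enumerate(ptkb, 1))
    return call_llm(PROMPT.format(history=history, question=question,
                                  facts=facts))
```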
arXiv:2407.15346 [cs.CV, cs.CL, cs.MM] https://arxiv.org/abs/2407.15346
Knowledge Acquisition Disentanglement for Knowledge-based Visual Question Answering with Large Language Models
Authors: Wenbin An, Feng Tian, Jiahao Nie, Wenkai Shi, Haonan Lin, Yan Chen, QianYing Wang, Yaqiang Wu, Guang Dai, Ping Chen
Abstract: Knowledge-based Visual Question Answering (KVQA) requires both image and world knowledge to answer questions. Current methods first retrieve knowledge from the image and an external knowledge base using the original complex question, then generate answers with Large Language Models (LLMs). However, since the original question contains complex elements that require knowledge from different sources, acquiring different kinds of knowledge in a coupled manner may confuse models and hinder them from retrieving precise knowledge. Furthermore, the "forward-only" answering process fails to explicitly capture the knowledge needs of LLMs, which can further hurt answering quality. To cope with these limitations, we propose DKA: Disentangled Knowledge Acquisition from LLM feedback, a training-free framework that disentangles knowledge acquisition to avoid confusion and uses the LLM's feedback to specify the required knowledge. Specifically, DKA requires LLMs to specify what knowledge they need to answer the question and to decompose the original complex question into two simple sub-questions: an image-based sub-question and a knowledge-based sub-question. The two sub-questions are then used to retrieve knowledge from the image and the knowledge base, respectively. In this way, the two knowledge-acquisition models can focus on the content that corresponds to them and avoid interference from irrelevant elements in the original complex question, which helps provide more precise knowledge, better aligns with the knowledge needs of LLMs, and yields correct answers. Experiments on benchmark datasets show that DKA significantly outperforms SOTA models. To facilitate future research, our data and code are available at https://github.com/Lackel/DKA.
Submitted 21 July, 2024; originally announced July 2024.
Comments: Pre-print
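As described, DKA is a training-free pipeline rather than a trained model. The runnable toy below restates only the step ordering from the abstract (decompose, retrieve per source, answer); every component is a trivial stand-in for an LLM call or retriever.

```python
# Toy sketch of the disentangled two-branch flow; all functions are
# stand-ins, only the step ordering reflects the abstract.
def decompose(question):
    # Hypothetical split: visual part vs. world-knowledge part.
    return (f"What is shown in the image relevant to: {question}",
            f"What background facts are needed for: {question}")

def retrieve_from_image(image, sub_q):
    return f"[visual evidence for '{sub_q}' in {image}]"

def retrieve_from_kb(kb, sub_q):
    return [fact for fact in kb if any(w in fact for w in sub_q.lower().split())]

def dka_answer(image, question, kb):
    img_q, know_q = decompose(question)
    evidence = retrieve_from_image(image, img_q)      # image branch
    facts = retrieve_from_kb(kb, know_q)              # knowledge-base branch
    return f"answer({question}) given {evidence} and {facts}"

print(dka_answer("photo.jpg", "Which country invented this dish?",
                 ["sushi: originated in japan", "pizza: originated in italy"]))
```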
arXiv:2407.10979 [cs.NI] https://arxiv.org/abs/2407.10979
Diffusion Model-based Incentive Mechanism with Prospect Theory for Edge AIGC Services in 6G IoT
Authors: Jinbo Wen, Jiangtian Nie, Yue Zhong, Changyan Yi, Xiaohuan Li, Jiangming Jin, Yang Zhang, Dusit Niyato
Abstract: The fusion of the Internet of Things (IoT) with Sixth-Generation (6G) technology has significant potential to revolutionize the IoT landscape. With the ultra-reliable and low-latency communication capabilities of 6G, 6G-IoT networks can transmit high-quality and diverse data to enhance edge learning. Artificial Intelligence-Generated Content (AIGC) harnesses advanced AI algorithms to automatically generate various types of content. Edge AIGC integrates with edge networks, facilitating real-time provision of customized AIGC services by deploying AIGC models on edge devices. However, the current practice of edge devices acting as AIGC Service Providers (ASPs) lacks incentives, hindering the sustainable provision of high-quality edge AIGC services amid information asymmetry. In this paper, we develop a user-centric incentive mechanism framework for edge AIGC services in 6G-IoT networks. Specifically, we first propose a contract theory model for incentivizing ASPs to provide AIGC services to clients. Recognizing the irrationality of clients towards personalized AIGC services, we utilize Prospect Theory (PT) to better capture their subjective utility. Furthermore, we adopt a diffusion-based soft actor-critic algorithm to generate the optimal contract design under PT, outperforming traditional deep reinforcement learning algorithms. Our numerical results demonstrate the effectiveness of the proposed scheme. (For the shape of a PT value function, see the sketch after arXiv:2410.01176 above.)
Submitted 25 July, 2024; v1 submitted 10 June, 2024; originally announced July 2024.
arXiv:2407.05238 [cs.CV] https://arxiv.org/abs/2407.05238
P2P: Part-to-Part Motion Cues Guide a Strong Tracking Framework for LiDAR Point Clouds
Authors: Jiahao Nie, Fei Xie, Sifan Zhou, Xueyi Zhou, Dong-Kyu Chae, Zhiwei He
Abstract: 3D single object tracking (SOT) methods based on appearance matching have long suffered from insufficient appearance information caused by incomplete, textureless and semantically deficient LiDAR point clouds. While the motion paradigm exploits motion cues instead of appearance matching for tracking, it incurs complex multi-stage processing and a segmentation module. In this paper, we first provide in-depth explorations of the motion paradigm, which prove that (i) it is feasible to directly infer target relative motion from point clouds across consecutive frames, and (ii) fine-grained information comparison between consecutive point clouds facilitates target motion modeling. We thereby propose to perform part-to-part motion modeling for consecutive point clouds and introduce a novel tracking framework, termed P2P. The framework fuses the information of each corresponding part between consecutive point clouds, effectively exploring detailed information changes and thus modeling accurate target-related motion cues. Following this framework, we present P2P-point and P2P-voxel models, incorporating implicit and explicit part-to-part motion modeling via point- and voxel-based representations, respectively. Without bells and whistles, P2P-voxel sets a new state-of-the-art performance (~89%, 72% and 63% precision on KITTI, NuScenes and Waymo Open Dataset, respectively). Moreover, under the same point-based representation, P2P-point outperforms the previous motion tracker M²Track by 3.3% and 6.7% on KITTI and NuScenes, while running at a considerably high speed of 107 FPS on a single RTX3090 GPU. The source code and pre-trained models are available at https://github.com/haooozi/P2P.
Submitted 8 July, 2024; v1 submitted 6 July, 2024; originally announced July 2024.
Comments: The source code and pre-trained models are available at https://github.com/haooozi/P2P
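A literal reading of "part-to-part" comparison is to pool a descriptor per spatial part of the target crop and difference it across consecutive frames before a motion head. The sketch below does exactly that on a coarse voxel grid; it illustrates the idea, not the released model.

```python
import numpy as np

def part_features(points, grid=(4, 4, 2), extent=4.0):
    """Pool point counts per spatial part (coarse voxel) of a target crop,
    giving a fixed-size per-part descriptor for one frame."""
    idx = np.floor((points + extent / 2) / extent * np.array(grid)).astype(int)
    idx = np.clip(idx, 0, np.array(grid) - 1)
    feat = np.zeros(grid)
    np.add.at(feat, (idx[:, 0], idx[:, 1], idx[:, 2]), 1.0)
    return feat

prev = np.random.randn(512, 3)
curr = prev + np.array([0.4, 0.1, 0.0])          # target shifted 0.4 m in x
# Part-wise difference across consecutive frames: the raw motion cue a
# regression head would consume.
motion_cue = part_features(curr) - part_features(prev)
print(motion_cue.shape, motion_cue.sum())
```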
arXiv:2407.05083 [cs.SI] https://arxiv.org/abs/2407.05083
Exploring agent interaction patterns in the comment sections of fake and real news
Authors: Kailun Zhu, Songtao Peng, Jiaqi Nie, Zhongyuan Ruan, Shanqing Yu, Qi Xuan
Abstract: User comments on social media have been recognized as a crucial factor in distinguishing between fake and real news, with many studies focusing on the textual content of user reactions. However, the interactions among agents in the comment sections of fake and real news have not been fully explored. In this study, we analyze a dataset comprising both fake and real news from Reddit to investigate agent interaction patterns, considering both the network structure and the sentiment of the nodes. Our findings reveal that (i) comments on fake news are more likely to form groups, and (ii) whereas fake news draws more negative sentiment from users, real news tends to elicit more neutral and positive sentiments. Additionally, nodes with similar sentiments cluster together more tightly than anticipated. From a dynamic perspective, we found that the sentiment distribution among nodes stabilizes early and remains stable over time. These findings have both theoretical and practical implications, particularly for the early detection of real and fake news within social networks.
Submitted 11 October, 2024; v1 submitted 6 July, 2024; originally announced July 2024.
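The claim that like-sentiment nodes cluster together can be quantified with an assortativity coefficient over a node attribute. The snippet below shows that measurement on a made-up toy comment graph using networkx; the dataset and labels are illustrative only.

```python
import networkx as nx

# Toy comment-reply graph with a categorical sentiment label per user.
# Positive attribute assortativity means like-sentiment users interact
# more often than chance, the clustering pattern the abstract reports.
G = nx.Graph()
labels = {1: "pos", 2: "pos", 3: "pos", 4: "neg", 5: "neg", 6: "neg"}
G.add_nodes_from((n, {"sentiment": s}) for n, s in labels.items())
G.add_edges_from([(1, 2), (2, 3), (1, 3), (4, 5), (5, 6), (4, 6), (3, 4)])

print(nx.attribute_assortativity_coefficient(G, "sentiment"))
```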
arXiv:2407.03040 [cs.CL, cs.AI] https://arxiv.org/abs/2407.03040
Raw Text is All you Need: Knowledge-intensive Multi-turn Instruction Tuning for Large Language Model
Authors: Xia Hou, Qifeng Li, Jian Yang, Tongliang Li, Linzheng Chai, Xianjie Wu, Hangyuan Ji, Zhoujun Li, Jixuan Nie, Jingbo Dun, Wenfeng Song
Abstract: Instruction tuning is an effective technique for aligning the outputs of large language models (LLMs) with human preference. But how to generate multi-turn dialogues from raw documents for instruction tuning still requires further exploration. In this paper, we present a novel framework named R2S that leverages a Chain of Dialogue (CoD) logic to guide LLMs in generating knowledge-intensive multi-turn dialogues for instruction tuning. By integrating raw documents from both open-source datasets and domain-specific web-crawled documents into the benchmark K-BENCH, we cover diverse areas such as Wikipedia (English), Science (Chinese), and Artifacts (Chinese). Our approach first decides the logic flow of the current dialogue and then prompts LLMs to produce key phrases for sourcing relevant response content. This methodology enables the creation of the GInstruct instruction dataset, retaining raw document knowledge within dialogue-style interactions. Utilizing this dataset, we fine-tune GLLM, a model designed to transform raw documents into structured multi-turn dialogues, thereby injecting comprehensive domain knowledge into the SFT model for enhanced instruction tuning. This work signifies a stride towards refining the adaptability and effectiveness of LLMs in processing and generating more accurate, contextually nuanced responses across various fields.
Submitted 3 July, 2024; originally announced July 2024.
Comments: 11 pages, 3 figures
MSC Class: 68T50; ACM Class: I.2.7
To tackle this issue, we employ MetaMapLite, an existing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02719v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02719v1-abstract-full" style="display: none;"> Document-level biomedical concept extraction is the task of identifying biomedical concepts mentioned in a given document. Recent advancements have adapted pre-trained language models for this task. However, the scarcity of domain-specific data and the deviation of concepts from their canonical names often hinder these models' effectiveness. To tackle this issue, we employ MetaMapLite, an existing rule-based concept mapping system, to generate additional pseudo-annotated data from PubMed and PMC. The annotated data are used to augment the limited training data. Through extensive experiments, this study demonstrates the utility of a manually crafted concept mapping tool for training a better concept extraction model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02719v1-abstract-full').style.display = 'none'; document.getElementById('2407.02719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18868">arXiv:2406.18868</a> <span> [<a href="https://arxiv.org/pdf/2406.18868">pdf</a>, <a href="https://arxiv.org/format/2406.18868">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing Cross-domain Discriminability in Continual Learning of Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yicheng Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yusong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+H">Huiping Zhuang</a>, <a href="/search/cs?searchtype=author&query=Okumura%2C+M">Manabu Okumura</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18868v4-abstract-short" style="display: inline;"> Continual learning (CL) with Vision-Language Models (VLMs) has overcome the constraints of traditional CL, which only focuses on previously encountered classes. During the CL of VLMs, we need not only to prevent the catastrophic forgetting of incrementally learned knowledge but also to preserve the zero-shot ability of VLMs.
However, existing methods require additional reference datasets to mainta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18868v4-abstract-full').style.display = 'inline'; document.getElementById('2406.18868v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18868v4-abstract-full" style="display: none;"> Continual learning (CL) with Vision-Language Models (VLMs) has overcome the constraints of traditional CL, which only focuses on previously encountered classes. During the CL of VLMs, we need not only to prevent the catastrophic forgetting of incrementally learned knowledge but also to preserve the zero-shot ability of VLMs. However, existing methods require additional reference datasets to maintain such zero-shot ability and rely on domain-identity hints to classify images across different domains. In this study, we propose Regression-based Analytic Incremental Learning (RAIL), which utilizes a recursive ridge regression-based adapter to learn from a sequence of domains in a non-forgetting manner and decouple the cross-domain correlations by projecting features to a higher-dimensional space. Cooperating with a training-free fusion module, RAIL absolutely preserves the VLM's zero-shot ability on unseen domains without any reference data. Additionally, we introduce the Cross-domain Task-Agnostic Incremental Learning (X-TAIL) setting. In this setting, a CL learner is required to incrementally learn from multiple domains and classify test images from both seen and unseen domains without any domain-identity hint. We theoretically prove RAIL's absolute memorization on incrementally learned domains. Experiment results affirm RAIL's state-of-the-art performance in both X-TAIL and existing Multi-domain Task-Incremental Learning settings. The code is released at https://github.com/linghan1997/Regression-based-Analytic-Incremental-Learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18868v4-abstract-full').style.display = 'none'; document.getElementById('2406.18868v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
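<p class="is-size-7 has-text-grey-dark">The recursive ridge-regression adapter described in the RAIL abstract above can be made concrete with a minimal sketch, assuming frozen backbone features and one-hot labels. This is not the authors' released implementation (their code is linked in the abstract); the class and method names are illustrative only.</p> <pre class="is-size-7"><code>import numpy as np

class RidgeAdapter:
    """Minimal sketch of a recursive ridge-regression adapter.

    Keeps two running sums, G = sum(X.T @ X) + lam * I and C = sum(X.T @ Y),
    so each new domain is absorbed by addition alone and the closed-form
    solution below stays exact for all data seen so far (no forgetting).
    """

    def __init__(self, feat_dim, num_classes, lam=1.0):
        self.G = lam * np.eye(feat_dim)             # regularized Gram accumulator
        self.C = np.zeros((feat_dim, num_classes))  # feature-label cross term

    def update(self, feats, onehot_labels):
        # Absorb one batch/domain; previously absorbed domains are untouched.
        self.G += feats.T @ feats
        self.C += feats.T @ onehot_labels

    def predict(self, feats):
        # Closed-form ridge weights W solve G @ W = C.
        return feats @ np.linalg.solve(self.G, self.C)

# Illustrative use with random stand-in features from two sequential "domains":
rng = np.random.default_rng(0)
adapter = RidgeAdapter(feat_dim=512, num_classes=10)
for _ in range(2):
    X = rng.normal(size=(100, 512))
    Y = np.eye(10)[rng.integers(0, 10, size=100)]
    adapter.update(X, Y)
print(adapter.predict(X).argmax(axis=1)[:5])
</code></pre> <p class="is-size-7 has-text-grey-dark">The higher-dimensional projection mentioned in the abstract could be a fixed feature map applied before each update; that detail is omitted in this sketch.</p>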
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.13996">arXiv:2406.13996</a> <span> [<a href="https://arxiv.org/pdf/2406.13996">pdf</a>, <a href="https://arxiv.org/ps/2406.13996">ps</a>, <a href="https://arxiv.org/format/2406.13996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3637528.3671840">10.1145/3637528.3671840 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unifying Graph Convolution and Contrastive Learning in Collaborative Filtering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yihong Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Le Zhang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+T">Tianyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.13996v2-abstract-short" style="display: inline;"> Graph-based models and contrastive learning have emerged as prominent methods in Collaborative Filtering (CF). While many existing models in CF incorporate these methods in their design, there seems to be a limited depth of analysis regarding the foundational principles behind them. This paper bridges graph convolution, a pivotal element of graph-based models, with contrastive learning through a t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13996v2-abstract-full').style.display = 'inline'; document.getElementById('2406.13996v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.13996v2-abstract-full" style="display: none;"> Graph-based models and contrastive learning have emerged as prominent methods in Collaborative Filtering (CF). While many existing models in CF incorporate these methods in their design, there seems to be a limited depth of analysis regarding the foundational principles behind them. This paper bridges graph convolution, a pivotal element of graph-based models, with contrastive learning through a theoretical framework. By examining the learning dynamics and equilibrium of the contrastive loss, we offer a fresh lens to understand contrastive learning via graph theory, emphasizing its capability to capture high-order connectivity. Building on this analysis, we further show that the graph convolutional layers often used in graph-based models are not essential for high-order connectivity modeling and might contribute to the risk of oversmoothing. 
Stemming from our findings, we introduce Simple Contrastive Collaborative Filtering (SCCF), a simple and effective algorithm based on a naive embedding model and a modified contrastive loss. The efficacy of the algorithm is demonstrated through extensive experiments across four public datasets. The experiment code is available at https://github.com/wu1hong/SCCF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.13996v2-abstract-full').style.display = 'none'; document.getElementById('2406.13996v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">KDD 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12718">arXiv:2406.12718</a> <span> [<a href="https://arxiv.org/pdf/2406.12718">pdf</a>, <a href="https://arxiv.org/format/2406.12718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AGLA: Mitigating Object Hallucinations in Large Vision-Language Models with Assembly of Global and Local Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+W">Wenbin An</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+S">Sicong Leng</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haonan Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">QianYing Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+G">Guang Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Ping Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12718v2-abstract-short" style="display: inline;"> Despite their great success across various multimodal tasks, Large Vision-Language Models (LVLMs) are facing a prevalent problem with object hallucinations, where the generated textual responses are inconsistent with ground-truth objects in the given image.
This paper investigates various LVLMs and pinpoints attention deficiency toward discriminative local image features as one root cause of objec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12718v2-abstract-full').style.display = 'inline'; document.getElementById('2406.12718v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12718v2-abstract-full" style="display: none;"> Despite their great success across various multimodal tasks, Large Vision-Language Models (LVLMs) are facing a prevalent problem with object hallucinations, where the generated textual responses are inconsistent with ground-truth objects in the given image. This paper investigates various LVLMs and pinpoints attention deficiency toward discriminative local image features as one root cause of object hallucinations. Specifically, LVLMs predominantly attend to prompt-independent global image features, while failing to capture prompt-relevant local features, consequently undermining the visual grounding capacity of LVLMs and leading to hallucinations. To this end, we propose Assembly of Global and Local Attention (AGLA), a training-free and plug-and-play approach that mitigates object hallucinations by exploring an ensemble of global features for response generation and local features for visual discrimination simultaneously. Our approach exhibits an image-prompt matching scheme that captures prompt-relevant local features from images, leading to an augmented view of the input image where prompt-relevant content is reserved while irrelevant distractions are masked. With the augmented view, a calibrated decoding distribution can be derived by integrating generative global features from the original image and discriminative local features from the augmented image. Extensive experiments show that AGLA consistently mitigates object hallucinations and enhances general perception capability for LVLMs across various discriminative and generative benchmarks. Our code will be released at https://github.com/Lackel/AGLA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12718v2-abstract-full').style.display = 'none'; document.getElementById('2406.12718v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
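<p class="is-size-7 has-text-grey-dark">The calibrated decoding distribution in the AGLA abstract above admits a compact illustration. The sketch below is not the authors' code (their repository is linked in the abstract); it shows one plausible way to fuse next-token logits from the original image (global, generative view) and a prompt-masked augmented view (local, discriminative), with the mixing weight alpha as an assumed knob.</p> <pre class="is-size-7"><code>import numpy as np

def fuse_global_local_logits(logits_global, logits_local, alpha=0.5):
    """Blend logits from the original (global) and augmented (local) image
    views, then renormalize into a decoding distribution.  AGLA's actual
    calibration may weight or truncate the vocabulary differently."""
    fused = (1.0 - alpha) * logits_global + alpha * logits_local
    probs = np.exp(fused - fused.max())   # numerically stable softmax
    return probs / probs.sum()

# Illustrative call with random stand-in logits over a 6-token vocabulary:
rng = np.random.default_rng(0)
print(fuse_global_local_logits(rng.normal(size=6), rng.normal(size=6)))
</code></pre>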
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09121">arXiv:2406.09121</a> <span> [<a href="https://arxiv.org/pdf/2406.09121">pdf</a>, <a href="https://arxiv.org/format/2406.09121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MMRel: A Relation Understanding Benchmark in the MLLM Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gongjie Zhang</a>, <a href="/search/cs?searchtype=author&query=An%2C+W">Wenbin An</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09121v2-abstract-short" style="display: inline;"> Though Multi-modal Large Language Models (MLLMs) have recently achieved significant progress, they often face various problems while handling inter-object relations, i.e., the interaction or association among distinct objects. This constraint largely stems from insufficient training and evaluation data for relation understanding, which has greatly impeded MLLMs in various vision-language generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09121v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09121v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09121v2-abstract-full" style="display: none;"> Though Multi-modal Large Language Models (MLLMs) have recently achieved significant progress, they often face various problems while handling inter-object relations, i.e., the interaction or association among distinct objects. This constraint largely stems from insufficient training and evaluation data for relation understanding, which has greatly impeded MLLMs in various vision-language generation and reasoning tasks. We attempt to address this challenge by introducing Multi-Modal Relation Understanding (MMRel), a benchmark that features large-scale, high-quality, and diverse data on inter-object relations. MMRel features three distinctive attributes: (i) It contains over 22K question-answer pairs, spanning three distinct domains and covering three relation categories, ensuring both scale and diversity; (ii) it provides manually verified, high-quality labels to ensure exceptional annotation accuracy; (iii) it includes adversarial cases with highly unusual relations, offering a challenging setting for evaluating relation hallucination. These features make MMRel ideal for evaluating MLLMs on relation understanding, as well as for fine-tuning MLLMs to enhance relation comprehension capability. Extensive experiments verify the effectiveness of MMRel in evaluating and enhancing MLLMs' relation understanding capabilities. 
The benchmark has been released publicly at: https://niejiahao1998.github.io/MMRel/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09121v2-abstract-full').style.display = 'none'; document.getElementById('2406.09121v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05013">arXiv:2406.05013</a> <span> [<a href="https://arxiv.org/pdf/2406.05013">pdf</a>, <a href="https://arxiv.org/format/2406.05013">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CHIQ: Contextual History Enhancement for Improving Query Rewriting in Conversational Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Ghaddar%2C+A">Abbas Ghaddar</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+K">Kelong Mao</a>, <a href="/search/cs?searchtype=author&query=Rezagholizadeh%2C+M">Mehdi Rezagholizadeh</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Boxing Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qun Liu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05013v2-abstract-short" style="display: inline;"> In this paper, we study how open-source large language models (LLMs) can be effectively deployed for improving query rewriting in conversational search, especially for ambiguous queries. We introduce CHIQ, a two-step method that leverages the capabilities of LLMs to resolve ambiguities in the conversation history before query rewriting. This approach contrasts with prior studies that predominantly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05013v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05013v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05013v2-abstract-full" style="display: none;"> In this paper, we study how open-source large language models (LLMs) can be effectively deployed for improving query rewriting in conversational search, especially for ambiguous queries. We introduce CHIQ, a two-step method that leverages the capabilities of LLMs to resolve ambiguities in the conversation history before query rewriting. This approach contrasts with prior studies that predominantly use closed-source LLMs to directly generate search queries from conversation history. 
We demonstrate on five well-established benchmarks that CHIQ leads to state-of-the-art results across most settings, showing highly competitive performances with systems leveraging closed-source LLMs. Our study provides a first step towards leveraging open-source LLMs in conversational search, as a competitive alternative to the prevailing reliance on commercial LLMs. Data, models, and source code will be publicly available upon acceptance at https://github.com/fengranMark/CHIQ. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05013v2-abstract-full').style.display = 'none'; document.getElementById('2406.05013v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03249">arXiv:2406.03249</a> <span> [<a href="https://arxiv.org/pdf/2406.03249">pdf</a>, <a href="https://arxiv.org/format/2406.03249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Near-field Beam training for Extremely Large-scale MIMO Based on Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiali Nie</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuanhao Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhaohui Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Weijie Yuan</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+X">Xiaojun Jing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03249v2-abstract-short" style="display: inline;"> Extremely Large-scale Array (ELAA) is considered a frontier technology for future communication systems, pivotal in improving wireless systems' rate and spectral efficiency. As ELAA employs a multitude of antennas operating at higher frequencies, users are typically situated in the near-field region where the spherical wavefront propagates. The near-field beam training in ELAA requires both angle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03249v2-abstract-full').style.display = 'inline'; document.getElementById('2406.03249v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03249v2-abstract-full" style="display: none;"> Extremely Large-scale Array (ELAA) is considered a frontier technology for future communication systems, pivotal in improving wireless systems' rate and spectral efficiency. As ELAA employs a multitude of antennas operating at higher frequencies, users are typically situated in the near-field region where the spherical wavefront propagates. 
The near-field beam training in ELAA requires both angle and distance information, which inevitably leads to a significant increase in the beam training overhead. To address this problem, we propose a near-field beam training method based on deep learning. We use a convolutional neural network (CNN) to efficiently learn channel characteristics from historical data by strategically selecting padding and kernel sizes. The negative value of the user average achievable rate is utilized as the loss function to optimize the beamformer. This method maximizes multi-user networks' achievable rate without predefined beam codebooks. Upon deployment, the model requires solely the pre-estimated channel state information (CSI) to derive the optimal beamforming vector. The simulation results demonstrate that the proposed scheme achieves a more stable beamforming gain and significantly improves performance compared to the traditional beam training method. Furthermore, owing to the inherent traits of deep learning methodologies, this approach substantially diminishes the near-field beam training overhead. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03249v2-abstract-full').style.display = 'none'; document.getElementById('2406.03249v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.16671">arXiv:2405.16671</a> <span> [<a href="https://arxiv.org/pdf/2405.16671">pdf</a>, <a href="https://arxiv.org/format/2405.16671">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mixture of Latent Experts Using Tensor Products </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zhan Su</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Tiwari%2C+P">Prayag Tiwari</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a>, <a href="/search/cs?searchtype=author&query=Simonsen%2C+J+G">Jakob Grue Simonsen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.16671v2-abstract-short" style="display: inline;"> In multi-task learning, the conventional approach involves training a model on multiple tasks simultaneously. However, the training signals from different tasks can interfere with one another, potentially leading to <em>negative transfer</em>. To mitigate this, we investigate if modular language models can facilitate positive transfer and systematic generalization.
Specifically, we propose a novel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16671v2-abstract-full').style.display = 'inline'; document.getElementById('2405.16671v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.16671v2-abstract-full" style="display: none;"> In multi-task learning, the conventional approach involves training a model on multiple tasks simultaneously. However, the training signals from different tasks can interfere with one another, potentially leading to <em>negative transfer</em>. To mitigate this, we investigate if modular language models can facilitate positive transfer and systematic generalization. Specifically, we propose a novel modular language model (TensorPoly) that balances parameter efficiency with nuanced routing methods. For <em>modules</em>, we reparameterize Low-Rank Adaptation (LoRA) by employing an entangled tensor through the use of tensor product operations and name the resulting approach TLoRA. For the <em>routing function</em>, we tailor two innovative routing functions according to the granularity: TensorPoly-I, which directs to each rank within the entangled tensor, while TensorPoly-II offers a finer-grained routing approach targeting each order of the entangled tensor. The experimental results from the multi-task T0-benchmark demonstrate that: 1) all modular LMs surpass the corresponding dense approaches, highlighting the potential of modular language models to mitigate negative transfer in multi-task learning and deliver superior outcomes. 2) TensorPoly-I achieves higher parameter efficiency in adaptation and outperforms other modular LMs, which shows the potential of our approach in multi-task transfer learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.16671v2-abstract-full').style.display = 'none'; document.getElementById('2405.16671v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
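<p class="is-size-7 has-text-grey-dark">To give a feel for the tensor-product reparameterization named TLoRA above, here is a toy construction; the paper's entangled-tensor formulation and the TensorPoly-I/II routers are more elaborate. It builds the two LoRA factors as Kronecker products of smaller matrices, one simple way a tensor product can shrink an adapter's parameter count.</p> <pre class="is-size-7"><code>import numpy as np

def kron_lora_delta(seed=0):
    """Toy low-rank update whose factors are Kronecker (tensor) products.

    Plain LoRA stores delta_W = B @ A with B: (64, 4) and A: (4, 64),
    i.e. 512 parameters.  Building each factor from two small matrices
    (8x2 and 2x8 blocks) needs only 64 parameters, at the cost of
    imposing Kronecker structure on the factors.
    """
    rng = np.random.default_rng(seed)
    b1, b2 = rng.normal(size=(8, 2)), rng.normal(size=(8, 2))
    a1, a2 = rng.normal(size=(2, 8)), rng.normal(size=(2, 8))
    B = np.kron(b1, b2)   # (64, 4)
    A = np.kron(a1, a2)   # (4, 64)
    return B @ A          # (64, 64) update added onto a frozen weight

print(kron_lora_delta().shape)
</code></pre>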
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/microsoft/mttl/tree/zs_code</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15829">arXiv:2405.15829</a> <span> [<a href="https://arxiv.org/pdf/2405.15829">pdf</a>, <a href="https://arxiv.org/format/2405.15829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Spatio-temporal Value Semantics-based Abstraction for Dense Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jihui Nie</a>, <a href="/search/cs?searchtype=author&query=Du%2C+D">Dehui Du</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jiangnan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15829v1-abstract-short" style="display: inline;"> Intelligent Cyber-Physical Systems (ICPS) represent a specialized form of Cyber-Physical System (CPS) that incorporates intelligent components, notably Convolutional Neural Networks (CNNs) and Deep Reinforcement Learning (DRL), to undertake multifaceted tasks encompassing perception, decision-making, and control. The utilization of DRL for decision-making facilitates dynamic interaction with the e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15829v1-abstract-full').style.display = 'inline'; document.getElementById('2405.15829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15829v1-abstract-full" style="display: none;"> Intelligent Cyber-Physical Systems (ICPS) represent a specialized form of Cyber-Physical System (CPS) that incorporates intelligent components, notably Convolutional Neural Networks (CNNs) and Deep Reinforcement Learning (DRL), to undertake multifaceted tasks encompassing perception, decision-making, and control. The utilization of DRL for decision-making facilitates dynamic interaction with the environment, generating control actions aimed at maximizing cumulative rewards. Nevertheless, the inherent uncertainty of the operational environment and the intricate nature of ICPS necessitate exploration within complex and dynamic state spaces during the learning phase. DRL confronts challenges in terms of efficiency, generalization capabilities, and data scarcity during decision-making process. In response to these challenges, we propose an innovative abstract modeling approach grounded in spatial-temporal value semantics, capturing the evolution in the distribution of semantic value across time and space. A semantics-based abstraction is introduced to construct an abstract Markov Decision Process (MDP) for the DRL learning process. Furthermore, optimization techniques for abstraction are delineated, aiming to refine the abstract model and mitigate semantic gaps between abstract and concrete states. 
The efficacy of the abstract modeling is assessed through the evaluation and analysis of the abstract MDP model using PRISM. A series of experiments are conducted, involving diverse scenarios such as lane-keeping, adaptive cruise control, and intersection crossroad assistance, to demonstrate the effectiveness of our abstracting approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15829v1-abstract-full').style.display = 'none'; document.getElementById('2405.15829v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 7 figures, conference</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68N30 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.13325">arXiv:2405.13325</a> <span> [<a href="https://arxiv.org/pdf/2405.13325">pdf</a>, <a href="https://arxiv.org/format/2405.13325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> DEGAP: Dual Event-Guided Adaptive Prefixes for Templated-Based Event Argument Extraction with Slot Querying </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanghui Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dexi Liu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Q">Qizhi Wan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Rong Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiping Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wanlong Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.13325v2-abstract-short" style="display: inline;"> Recent advancements in event argument extraction (EAE) involve incorporating useful auxiliary information into models during training and inference, such as retrieved instances and event templates. These methods face two challenges: (1) the retrieval results may be irrelevant and (2) templates are developed independently for each event without considering their possible relationship. 
In this work,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13325v2-abstract-full').style.display = 'inline'; document.getElementById('2405.13325v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.13325v2-abstract-full" style="display: none;"> Recent advancements in event argument extraction (EAE) involve incorporating useful auxiliary information into models during training and inference, such as retrieved instances and event templates. These methods face two challenges: (1) the retrieval results may be irrelevant and (2) templates are developed independently for each event without considering their possible relationship. In this work, we propose DEGAP to address these challenges through simple yet effective components: dual prefixes, i.e., learnable prompt vectors, where the instance-oriented prefix and template-oriented prefix are trained to learn information from different event instances and templates. Additionally, we propose an event-guided adaptive gating mechanism, which can adaptively leverage possible connections between different events and thus capture relevant information from the prefix. Finally, these event-guided prefixes provide relevant information as cues to the EAE model without retrieval. Extensive experiments demonstrate that our method achieves new state-of-the-art performance on four datasets (ACE05, RAMS, WIKIEVENTS, and MLEE). Further analysis shows the impact of different components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.13325v2-abstract-full').style.display = 'none'; document.getElementById('2405.13325v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
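<p class="is-size-7 has-text-grey-dark">The dual-prefix-plus-gating design in the DEGAP abstract can be pictured as follows. This is a sketch of the general shape only: prefix lengths, gate parameterization, and how the prefixes enter attention are all assumptions here, not details from the paper.</p> <pre class="is-size-7"><code>import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def event_guided_prefix(event_emb, prefix_instance, prefix_template, W_gate):
    """Mix an instance-oriented and a template-oriented prefix with a gate
    conditioned on the event embedding, so related events can draw
    different blends of the two information sources."""
    g = sigmoid(W_gate @ event_emb)               # per-dimension gate in (0, 1)
    return g * prefix_instance + (1.0 - g) * prefix_template

# Illustrative shapes: 16-dim event embedding, prefix of 8 vectors x 32 dims.
rng = np.random.default_rng(0)
prefix = event_guided_prefix(rng.normal(size=16),
                             rng.normal(size=(8, 32)),
                             rng.normal(size=(8, 32)),
                             rng.normal(size=(32, 16)))
print(prefix.shape)
</code></pre>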
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10936">arXiv:2405.10936</a> <span> [<a href="https://arxiv.org/pdf/2405.10936">pdf</a>, <a href="https://arxiv.org/format/2405.10936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Large Language Models with Multilingualism: Recent Advances and New Frontiers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaiyu Huang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongliang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">You Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanchi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+W">Weijian Yi</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yulong Mao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinchen Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yuzhuang Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinan Xu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10936v2-abstract-short" style="display: inline;"> The rapid development of Large Language Models (LLMs) demonstrates remarkable multilingual capabilities in natural language processing, attracting global attention in both academia and industry. To mitigate potential discrimination and enhance the overall usability and accessibility for diverse language user groups, it is important to develop language-fair technology. Despite the break… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10936v2-abstract-full').style.display = 'inline'; document.getElementById('2405.10936v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10936v2-abstract-full" style="display: none;"> The rapid development of Large Language Models (LLMs) demonstrates remarkable multilingual capabilities in natural language processing, attracting global attention in both academia and industry. To mitigate potential discrimination and enhance the overall usability and accessibility for diverse language user groups, it is important to develop language-fair technology. Despite the breakthroughs of LLMs, the investigation into the multilingual scenario remains insufficient, where a comprehensive survey to summarize recent approaches, developments, limitations, and potential solutions is desirable. To this end, we provide a survey with multiple perspectives on the utilization of LLMs in the multilingual scenario. We first rethink the transitions between previous and current research on pre-trained language models.
Then we introduce several perspectives on the multilingualism of LLMs, including training and inference methods, information retrieval, model security, multi-domain with language culture, and usage of datasets. We also discuss the major challenges that arise in these aspects, along with possible solutions. Besides, we highlight future research directions that aim at further enhancing LLMs with multilingualism. The survey aims to help the research community address multilingual problems and provide a comprehensive understanding of the core concepts, key techniques, and latest developments in multilingual natural language processing based on LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10936v2-abstract-full').style.display = 'none'; document.getElementById('2405.10936v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">65 pages, Work in Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09487">arXiv:2405.09487</a> <span> [<a href="https://arxiv.org/pdf/2405.09487">pdf</a>, <a href="https://arxiv.org/format/2405.09487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Color Space Learning for Cross-Color Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09487v1-abstract-short" style="display: inline;"> The primary color profile of the same identity is assumed to remain consistent in typical Person Re-identification (Person ReID) tasks. However, this assumption may be invalid in real-world situations and images hold variant color profiles, because of cross-modality cameras or identity with different clothing. To address this issue, we propose Color Space Learning (CSL) for those Cross-Color Perso… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09487v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09487v1-abstract-full" style="display: none;"> The primary color profile of the same identity is assumed to remain consistent in typical Person Re-identification (Person ReID) tasks. However, this assumption may be invalid in real-world situations and images hold variant color profiles, because of cross-modality cameras or identity with different clothing. 
To address this issue, we propose Color Space Learning (CSL) for those Cross-Color Person ReID problems. Specifically, CSL guides the model to be less color-sensitive with two modules: Image-level Color-Augmentation and Pixel-level Color-Transformation. The first module increases the color diversity of the inputs and guides the model to focus more on the non-color information. The second module projects every pixel of input images onto a new color space. In addition, we introduce a new Person ReID benchmark across RGB and Infrared modalities, NTU-Corridor, which is the first with privacy agreements from all participants. To evaluate the effectiveness and robustness of our proposed CSL, we evaluate it on several Cross-Color Person ReID benchmarks. Our method surpasses the state-of-the-art methods consistently. The code and benchmark are available at: https://github.com/niejiahao1998/CSL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09487v1-abstract-full').style.display = 'none'; document.getElementById('2405.09487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICME 2024 (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03228">arXiv:2405.03228</a> <span> [<a href="https://arxiv.org/pdf/2405.03228">pdf</a>, <a href="https://arxiv.org/format/2405.03228">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TED: Accelerate Model Training by Internal Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinying Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Ping Li</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jie Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03228v2-abstract-short" style="display: inline;"> Large language models have demonstrated strong performance in recent years, but the high cost of training drives the need for efficient methods to compress dataset sizes. We propose TED pruning, a method that addresses the challenge of overfitting under high pruning ratios by quantifying the model's ability to improve performance on pruned data while fitting retained data, known as Internal Genera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03228v2-abstract-full').style.display = 'inline'; document.getElementById('2405.03228v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03228v2-abstract-full" style="display: none;"> Large language models have demonstrated strong performance in recent years, but the high cost of training drives the need for efficient methods to compress dataset sizes. 
We propose TED pruning, a method that addresses the challenge of overfitting under high pruning ratios by quantifying the model's ability to improve performance on pruned data while fitting retained data, known as Internal Generalization (IG). TED uses an optimization objective based on Internal Generalization Distance (IGD), measuring changes in IG before and after pruning to align with true generalization performance and achieve implicit regularization. The IGD optimization objective was verified to allow the model to achieve the smallest upper bound on generalization error. The impact of small mask fluctuations on IG is studied through masks and Taylor approximation, and fast estimation of IGD is enabled. In analyzing continuous training dynamics, the prior effect of IGD is validated, and a progressive pruning strategy is proposed. Experiments on image classification, natural language understanding, and large language model fine-tuning show TED achieves lossless performance with 60-70% of the data. Upon acceptance, our code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03228v2-abstract-full').style.display = 'none'; document.getElementById('2405.03228v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024.
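<p class="is-size-7 has-text-grey-dark">The progressive pruning strategy in the TED abstract above can be shown in skeleton form. The per-example scores below are a placeholder: TED's actual criterion, Internal Generalization Distance estimated via masks and a Taylor approximation, is not reproduced, so only the keep-the-top-fraction-and-tighten loop is illustrated.</p> <pre class="is-size-7"><code>import numpy as np

def progressive_prune(scores, keep_frac=0.65, steps=3):
    """Keep the highest-scoring examples, tightening the kept fraction over
    several rounds (a stand-in for TED's progressive pruning schedule)."""
    kept = np.arange(len(scores))
    for step in range(1, steps + 1):
        frac = 1.0 - (1.0 - keep_frac) * step / steps  # anneal toward keep_frac
        k = max(1, int(round(frac * len(scores))))
        order = np.argsort(scores[kept])[::-1]         # descending by score
        kept = kept[order[:k]]
    return np.sort(kept)

# Illustrative run on random stand-in utility scores for 1000 examples:
rng = np.random.default_rng(0)
print(len(progressive_prune(rng.normal(size=1000))))   # about 650 survive
</code></pre>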
This is followed by fine-tuning on a smaller, specialized dataset of President Xu's calligra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17199v1-abstract-full').style.display = 'inline'; document.getElementById('2404.17199v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.17199v1-abstract-full" style="display: none;"> We introduced "Presidifussion," a novel approach to learning and replicating the unique style of calligraphy of President Xu, using a pretrained diffusion model adapted through a two-stage training process. Initially, our model is pretrained on a diverse dataset containing works from various calligraphers. This is followed by fine-tuning on a smaller, specialized dataset of President Xu's calligraphy, comprising just under 200 images. Our method introduces innovative techniques of font image conditioning and stroke information conditioning, enabling the model to capture the intricate structural elements of Chinese characters. The effectiveness of our approach is demonstrated through a comparison with traditional methods like zi2zi and CalliGAN, with our model achieving comparable performance using significantly smaller datasets and reduced computational resources. This work not only presents a breakthrough in the digital preservation of calligraphic art but also sets a new standard for data-efficient generative modeling in the domain of cultural heritage digitization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.17199v1-abstract-full').style.display = 'none'; document.getElementById('2404.17199v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13940">arXiv:2404.13940</a> <span> [<a href="https://arxiv.org/pdf/2404.13940">pdf</a>, <a href="https://arxiv.org/format/2404.13940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A User-Centric Multi-Intent Benchmark for Evaluating Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayin Wang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Peijie Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jian-Yun Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13940v3-abstract-short" style="display: inline;"> Large language models (LLMs) are essential tools that users employ across various scenarios, so evaluating their performance and guiding users in selecting the suitable service is important. 
Although many benchmarks exist, they mainly focus on specific predefined model abilities, such as world knowledge, reasoning, etc. Based on these ability scores, it is hard for users to determine which LLM bes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13940v3-abstract-full').style.display = 'inline'; document.getElementById('2404.13940v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13940v3-abstract-full" style="display: none;"> Large language models (LLMs) are essential tools that users employ across various scenarios, so evaluating their performance and guiding users in selecting the suitable service is important. Although many benchmarks exist, they mainly focus on specific predefined model abilities, such as world knowledge, reasoning, etc. Based on these ability scores, it is hard for users to determine which LLM best suits their particular needs. To address these issues, we propose to evaluate LLMs from a user-centric perspective and design this benchmark to measure their efficacy in satisfying user needs under distinct intents. Firstly, we collect 1,846 real-world use cases from a user study with 712 participants from 23 countries. This first-hand data helps us understand actual user intents and needs in LLM interactions, forming the User Reported Scenarios (URS) dataset, which is categorized with six types of user intents. Secondly, based on this authentic dataset, we benchmark 10 LLM services with GPT-4-as-Judge. Thirdly, we show that benchmark scores align well with human preference in both real-world experience and pair-wise annotations, achieving Pearson correlations of 0.95 and 0.94, respectively. This alignment confirms that the URS dataset and our evaluation method establish an effective user-centric benchmark. The dataset, code, and process data are available at https://github.com/Alice1998/URS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13940v3-abstract-full').style.display = 'none'; document.getElementById('2404.13940v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
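<p class="is-size-7 has-text-grey-dark">The alignment check reported in the URS abstract above, a Pearson correlation between GPT-4-as-Judge scores and human preference, is simple enough to spell out. The helper below is standard statistics rather than the paper's pipeline, and the example numbers are invented placeholders, not the paper's data.</p> <pre class="is-size-7"><code>import numpy as np

def pearson(x, y):
    """Pearson correlation coefficient between paired score lists."""
    x, y = np.asarray(x, float), np.asarray(y, float)
    xm, ym = x - x.mean(), y - y.mean()
    return float((xm @ ym) / np.sqrt((xm @ xm) * (ym @ ym)))

# Placeholder per-model scores (invented, for shape only):
benchmark_scores = [0.62, 0.55, 0.71, 0.48]
human_preference = [0.60, 0.50, 0.74, 0.45]
print(round(pearson(benchmark_scores, human_preference), 2))
</code></pre>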
arXiv:2404.09431 (https://arxiv.org/abs/2404.09431) [pdf, other] cs.CV
Title: VFMM3D: Releasing the Potential of Image by Vision Foundation Model for Monocular 3D Object Detection
Authors: Bonan Ding, Jin Xie, Jing Nie, Jiale Cao, Xuelong Li, Yanwei Pang
Abstract: Due to its cost-effectiveness and widespread availability, monocular 3D object detection, which relies solely on a single camera during inference, holds significant importance across various applications, including autonomous driving and robotics. Nevertheless, directly predicting the coordinates of objects in 3D space from monocular images poses challenges. Therefore, an effective solution involves transforming monocular images into LiDAR-like representations and employing a LiDAR-based 3D object detector to predict the 3D coordinates of objects. The key step in this method is accurately converting the monocular image into a reliable point cloud form. In this paper, we present VFMM3D, an innovative framework that leverages the capabilities of Vision Foundation Models (VFMs) to accurately transform single-view images into LiDAR point cloud representations. VFMM3D utilizes the Segment Anything Model (SAM) and the Depth Anything Model (DAM) to generate high-quality pseudo-LiDAR data enriched with foreground information. Specifically, DAM is employed to generate dense depth maps. Subsequently, SAM is utilized to differentiate foreground and background regions by predicting instance masks. These predicted instance masks and depth maps are then combined and projected into 3D space to generate pseudo-LiDAR points. Finally, any point-cloud-based object detector can be utilized to predict the 3D coordinates of objects. Comprehensive experiments are conducted on two challenging 3D object detection datasets, KITTI and Waymo. Our VFMM3D establishes a new state-of-the-art performance on both datasets. Additionally, experimental results demonstrate the generality of VFMM3D, showcasing its seamless integration into various LiDAR-based 3D object detectors.
Submitted 26 August, 2024; v1 submitted 14 April, 2024; originally announced April 2024.
Comments: 11 pages, 4 figures
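The core conversion step, depth map plus instance mask to pseudo-LiDAR, is a standard pinhole back-projection. A minimal sketch under assumed camera intrinsics (the fx/fy/cx/cy values below are KITTI-like placeholders; DAM would supply `depth`, SAM the `mask`):

```python
# Back-project a dense depth map into 3D camera coordinates, keeping only
# pixels inside predicted foreground instance masks.
import numpy as np

def depth_to_pseudo_lidar(depth, mask, fx, fy, cx, cy):
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))  # pixel coordinates
    z = depth
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    points = np.stack([x, y, z], axis=-1)           # (H, W, 3)
    return points[mask > 0]                         # foreground points only

depth = np.random.uniform(1.0, 40.0, (375, 1242))   # dummy KITTI-sized map
mask = np.zeros_like(depth); mask[150:250, 500:700] = 1
cloud = depth_to_pseudo_lidar(depth, mask, fx=721.5, fy=721.5, cx=609.6, cy=172.9)
print(cloud.shape)   # (N, 3) pseudo-LiDAR points
```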
arXiv:2404.01780 (https://arxiv.org/abs/2404.01780) [pdf, other] astro-ph.IM, astro-ph.GA, cs.CV
Title: CSST Strong Lensing Preparation: a Framework for Detecting Strong Lenses in the Multi-color Imaging Survey by the China Survey Space Telescope (CSST)
Authors: Xu Li, Ruiqi Sun, Jiameng Lv, Peng Jia, Nan Li, Chengliang Wei, Zou Hu, Xinzhong Er, Yun Chen, Zhang Ban, Yuedong Fang, Qi Guo, Dezi Liu, Guoliang Li, Lin Lin, Ming Li, Ran Li, Xiaobo Li, Yu Luo, Xianmin Meng, Jundan Nie, Zhaoxiang Qi, Yisheng Qiu, Li Shao, Hao Tian, et al. (7 additional authors not shown)
href="/search/cs?searchtype=author&query=Nie%2C+J">Jundan Nie</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Z">Zhaoxiang Qi</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yisheng Qiu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+L">Li Shao</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a> , et al. (7 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01780v1-abstract-short" style="display: inline;"> Strong gravitational lensing is a powerful tool for investigating dark matter and dark energy properties. With the advent of large-scale sky surveys, we can discover strong lensing systems on an unprecedented scale, which requires efficient tools to extract them from billions of astronomical objects. The existing mainstream lens-finding tools are based on machine learning algorithms and applied to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01780v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01780v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01780v1-abstract-full" style="display: none;"> Strong gravitational lensing is a powerful tool for investigating dark matter and dark energy properties. With the advent of large-scale sky surveys, we can discover strong lensing systems on an unprecedented scale, which requires efficient tools to extract them from billions of astronomical objects. The existing mainstream lens-finding tools are based on machine learning algorithms and applied to cut-out-centered galaxies. However, according to the design and survey strategy of optical surveys by CSST, preparing cutouts with multiple bands requires considerable efforts. To overcome these challenges, we have developed a framework based on a hierarchical visual Transformer with a sliding window technique to search for strong lensing systems within entire images. Moreover, given that multi-color images of strong lensing systems can provide insights into their physical characteristics, our framework is specifically crafted to identify strong lensing systems in images with any number of channels. As evaluated using CSST mock data based on an Semi-Analytic Model named CosmoDC2, our framework achieves precision and recall rates of 0.98 and 0.90, respectively. To evaluate the effectiveness of our method in real observations, we have applied it to a subset of images from the DESI Legacy Imaging Surveys and media images from Euclid Early Release Observations. 61 new strong lensing system candidates are discovered by our method. However, we also identified false positives arising primarily from the simplified galaxy morphology assumptions within the simulation. This underscores the practical limitations of our approach while simultaneously highlighting potential avenues for future improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01780v1-abstract-full').style.display = 'none'; document.getElementById('2404.01780v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is accepted by the AJ. The complete code could be downloaded with DOI of: 10.12149/101393. Comments are welcome</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00611">arXiv:2404.00611</a> <span> [<a href="https://arxiv.org/pdf/2404.00611">pdf</a>, <a href="https://arxiv.org/ps/2404.00611">ps</a>, <a href="https://arxiv.org/format/2404.00611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Object-level Copy-Move Forgery Image Detection based on Inconsistency Mining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyu Wang</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+N">Niantai Jing</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyao Liu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jie Nie</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuxin Qi</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+C">Chi-Hung Chi</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kwok-Yan Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00611v2-abstract-short" style="display: inline;"> In copy-move tampering operations, perpetrators often employ techniques, such as blurring, to conceal tampering traces, posing significant challenges to the detection of object-level targets with intact structures. Focus on these challenges, this paper proposes an Object-level Copy-Move Forgery Image Detection based on Inconsistency Mining (IMNet). To obtain complete object-level targets, we custo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00611v2-abstract-full').style.display = 'inline'; document.getElementById('2404.00611v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00611v2-abstract-full" style="display: none;"> In copy-move tampering operations, perpetrators often employ techniques, such as blurring, to conceal tampering traces, posing significant challenges to the detection of object-level targets with intact structures. Focus on these challenges, this paper proposes an Object-level Copy-Move Forgery Image Detection based on Inconsistency Mining (IMNet). To obtain complete object-level targets, we customize prototypes for both the source and tampered regions and dynamically update them. Additionally, we extract inconsistent regions between coarse similar regions obtained through self-correlation calculations and regions composed of prototypes. The detected inconsistent regions are used as supplements to coarse similar regions to refine pixel-level detection. We operate experiments on three public datasets which validate the effectiveness and the robustness of the proposed IMNet. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00611v2-abstract-full').style.display = 'none'; document.getElementById('2404.00611v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures, Accepted to WWW 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.19983">arXiv:2403.19983</a> <span> [<a href="https://arxiv.org/pdf/2403.19983">pdf</a>, <a href="https://arxiv.org/format/2403.19983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A multi-stage semi-supervised learning for ankle fracture classification on CT images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongzhi Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guicheng Li</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiacheng Nie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hui Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chunfeng Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Q">Qianjin Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hailin Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.19983v1-abstract-short" style="display: inline;"> Because of the complicated mechanism of ankle injury, it is very difficult to diagnose ankle fracture in clinic. In order to simplify the process of fracture diagnosis, an automatic diagnosis model of ankle fracture was proposed. Firstly, a tibia-fibula segmentation network is proposed for the joint tibiofibular region of the ankle joint, and the corresponding segmentation dataset is established o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.19983v1-abstract-full').style.display = 'inline'; document.getElementById('2403.19983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.19983v1-abstract-full" style="display: none;"> Because of the complicated mechanism of ankle injury, it is very difficult to diagnose ankle fracture in clinic. In order to simplify the process of fracture diagnosis, an automatic diagnosis model of ankle fracture was proposed. Firstly, a tibia-fibula segmentation network is proposed for the joint tibiofibular region of the ankle joint, and the corresponding segmentation dataset is established on the basis of fracture data. 
arXiv:2403.15285 (https://arxiv.org/abs/2403.15285) [pdf, other] cs.NI, cs.CR, cs.HC, cs.LG
Title: Blockchain-based Pseudonym Management for Vehicle Twin Migrations in Vehicular Edge Metaverse
Authors: Jiawen Kang, Xiaofeng Luo, Jiangtian Nie, Tianhao Wu, Haibo Zhou, Yonghua Wang, Dusit Niyato, Shiwen Mao, Shengli Xie
Abstract: Driven by the great advances in metaverse and edge computing technologies, vehicular edge metaverses are expected to disrupt the current paradigm of intelligent transportation systems.
As highly computerized avatars of Vehicular Metaverse Users (VMUs), the Vehicle Twins (VTs) deployed in edge servers can provide valuable metaverse services to improve driving safety and on-board satisfaction for their VMUs throughout journeys. To maintain uninterrupted metaverse experiences, VTs must be migrated among edge servers following the movements of vehicles. This can raise concerns about privacy breaches during the dynamic communications among vehicular edge metaverses. To address these concerns and safeguard location privacy, pseudonyms as temporary identifiers can be leveraged by both VMUs and VTs to realize anonymous communications in the physical space and virtual spaces. However, existing pseudonym management methods fall short in meeting the extensive pseudonym demands in vehicular edge metaverses, thus dramatically diminishing the performance of privacy preservation. To this end, we present a cross-metaverse empowered dual pseudonym management framework. We utilize cross-chain technology to enhance management efficiency and data security for pseudonyms. Furthermore, we propose a metric to assess the privacy level and employ a Multi-Agent Deep Reinforcement Learning (MADRL) approach to obtain an optimal pseudonym generating strategy. Numerical results demonstrate that our proposed schemes are highly efficient and cost-effective, showcasing their promising applications in vehicular edge metaverses.
Submitted 22 March, 2024; originally announced March 2024.
Comments: 14 pages, 9 figures
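For intuition on pseudonyms as temporary identifiers, here is an illustrative sketch only: per-epoch identifiers derived from a long-term secret via HMAC, so observed identifiers cannot be linked across migrations without the secret. The paper's cross-chain management and MADRL generation strategy go far beyond this:

```python
# Rotating, unlinkable per-epoch pseudonyms for a VMU/VT pair (illustrative).
import hmac, hashlib

def pseudonym(long_term_secret: bytes, epoch: int) -> str:
    msg = epoch.to_bytes(8, "big")
    return hmac.new(long_term_secret, msg, hashlib.sha256).hexdigest()[:16]

secret = b"vehicle-twin-registration-secret"   # hypothetical secret
print([pseudonym(secret, e) for e in range(3)])  # three successive pseudonyms
```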
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12690">arXiv:2403.12690</a> <span> [<a href="https://arxiv.org/pdf/2403.12690">pdf</a>, <a href="https://arxiv.org/format/2403.12690">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LNPT: Label-free Network Pruning and Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinying Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Ping Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhe Tang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jie Nie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12690v2-abstract-short" style="display: inline;"> Pruning before training enables the deployment of neural networks on smart devices. By retaining weights conducive to generalization, pruned networks can be accommodated on resource-constrained smart devices. It is commonly held that the distance on weight norms between the initialized and the fully-trained networks correlates with generalization performance. However, as we have uncovered, inconsi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12690v2-abstract-full').style.display = 'inline'; document.getElementById('2403.12690v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12690v2-abstract-full" style="display: none;"> Pruning before training enables the deployment of neural networks on smart devices. By retaining weights conducive to generalization, pruned networks can be accommodated on resource-constrained smart devices. It is commonly held that the distance on weight norms between the initialized and the fully-trained networks correlates with generalization performance. However, as we have uncovered, inconsistency between this metric and generalization during training processes, which poses an obstacle to determine the pruned structures on smart devices in advance. In this paper, we introduce the concept of the learning gap, emphasizing its accurate correlation with generalization. Experiments show that the learning gap, in the form of feature maps from the penultimate layer of networks, aligns with variations of generalization performance. We propose a novel learning framework, LNPT, which enables mature networks on the cloud to provide online guidance for network pruning and learning on smart devices with unlabeled data. Our results demonstrate the superiority of this approach over supervised training. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12690v2-abstract-full').style.display = 'none'; document.getElementById('2403.12690v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages,7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12688">arXiv:2403.12688</a> <span> [<a href="https://arxiv.org/pdf/2403.12688">pdf</a>, <a href="https://arxiv.org/format/2403.12688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SEVEN: Pruning Transformer Model by Reserving Sentinels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinying Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Ping Li</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jie Nie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhe Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12688v1-abstract-short" style="display: inline;"> Large-scale Transformer models (TM) have demonstrated outstanding performance across various tasks. However, their considerable parameter size restricts their applicability, particularly on mobile devices. Due to the dynamic and intricate nature of gradients on TM compared to Convolutional Neural Networks, commonly used pruning methods tend to retain weights with larger gradient noise. This result… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12688v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12688v1-abstract-full" style="display: none;"> Large-scale Transformer models (TM) have demonstrated outstanding performance across various tasks. However, their considerable parameter size restricts their applicability, particularly on mobile devices. Due to the dynamic and intricate nature of gradients on TM compared to Convolutional Neural Networks, commonly used pruning methods tend to retain weights with larger gradient noise. This results in pruned models that are sensitive to sparsity and datasets, exhibiting suboptimal performance. Symbolic Descent (SD) is a general approach for training and fine-tuning TM. In this paper, we attempt to describe the noisy batch gradient sequences on TM through the cumulative process of SD. We utilize this design to dynamically assess the importance scores of weights.SEVEN is introduced by us, which particularly favors weights with consistently high sensitivity, i.e., weights with small gradient noise. These weights are tended to be preserved by SEVEN. 
arXiv:2403.11335 (https://arxiv.org/abs/2403.11335) [pdf, other] cs.IR, cs.CL
Title: ConvSDG: Session Data Generation for Conversational Search
Authors: Fengran Mo, Bole Yi, Kelong Mao, Chen Qu, Kaiyu Huang, Jian-Yun Nie
Abstract: Conversational search provides a more convenient interface for users to search by allowing multi-turn interaction with the search engine. However, the effectiveness of conversational dense retrieval methods is limited by the scarcity of training data required for their fine-tuning. Thus, generating more training conversational sessions with relevant labels could potentially improve search performance. Based on the promising capabilities of large language models (LLMs) in text generation, we propose ConvSDG, a simple yet effective framework to explore the feasibility of boosting conversational search by using an LLM for session data generation. Within this framework, we design dialogue/session-level and query-level data generation with unsupervised and semi-supervised learning, according to the availability of relevance judgments. The generated data are used to fine-tune the conversational dense retriever. Extensive experiments on four widely used datasets demonstrate the effectiveness and broad applicability of our ConvSDG framework compared with several strong baselines.
Submitted 17 March, 2024; originally announced March 2024.
Comments: Accepted by WWW 2024 Workshop
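Session-level generation with an LLM can be sketched as iteratively asking the model for the user's next query given the conversation so far. `llm_generate` is a stand-in for any text-generation call, and the prompt is illustrative, not the paper's:

```python
# Generate a synthetic multi-turn search session from a seed query by
# repeatedly prompting an LLM for the next user query.
def generate_session(llm_generate, topic, seed_query, turns=4):
    session = [seed_query]
    for _ in range(turns - 1):
        prompt = (
            f"Topic: {topic}\n"
            f"Conversation so far: {session}\n"
            "Write the user's next search query in this conversation:"
        )
        session.append(llm_generate(prompt).strip())
    return session

# session = generate_session(my_llm, "renewable energy", "what is solar power?")
```

Each generated query would then be paired with relevance labels (unsupervised or semi-supervised, depending on availability) before fine-tuning the retriever.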
arXiv:2403.10779 (https://arxiv.org/abs/2403.10779) [pdf, other] cs.CL
Title: LLM-based Conversational AI Therapist for Daily Functioning Screening and Psychotherapeutic Intervention via Everyday Smart Devices
Authors: Jingping Nie, Hanya Shao, Yuang Fan, Qijia Shao, Haoxuan You, Matthias Preindl, Xiaofan Jiang
Abstract: Despite the global mental health crisis, barriers to screenings, professionals, and treatments remain high. In collaboration with licensed psychotherapists, we propose a Conversational AI Therapist with psychotherapeutic Interventions (CaiTI), a platform that leverages large language models (LLMs) and smart devices to enable better mental health self-care.
CaiTI can screen day-to-day functioning using natural and psychotherapeutic conversations. CaiTI leverages reinforcement learning to provide a personalized conversation flow and can accurately understand and interpret user responses. When the user needs further attention during the conversation, CaiTI can provide conversational psychotherapeutic interventions, including cognitive behavioral therapy (CBT) and motivational interviewing (MI). Leveraging the datasets prepared by the licensed psychotherapists, we experiment with and microbenchmark various LLMs' performance on tasks along CaiTI's conversation flow and discuss their strengths and weaknesses. With the psychotherapists, we implement CaiTI and conduct 14-day and 24-week studies. The study results, validated by therapists, demonstrate that CaiTI can converse with users naturally, accurately understand and interpret user responses, and provide psychotherapeutic interventions appropriately and effectively. We showcase the potential of CaiTI and LLMs to assist mental therapy diagnosis and treatment and to improve day-to-day functioning screening and precautionary psychotherapeutic intervention systems.
Submitted 15 March, 2024; originally announced March 2024.
arXiv:2403.04066 (https://arxiv.org/abs/2403.04066) [pdf, ps, other] cs.CV
Title: LoDisc: Learning Global-Local Discriminative Features for Self-Supervised Fine-Grained Visual Recognition
Authors: Jialu Shi, Zhiqiang Wei, Jie Nie, Lei Huang
Abstract: The self-supervised contrastive learning strategy has attracted remarkable attention due to its exceptional ability in representation learning. However, current contrastive learning tends to learn global coarse-grained representations of the image that benefit generic object recognition, and such coarse-grained features are insufficient for fine-grained visual recognition. In this paper, we propose to incorporate subtle local fine-grained feature learning into global self-supervised contrastive learning through a pure self-supervised global-local fine-grained contrastive learning framework. Specifically, a novel pretext task called Local Discrimination (LoDisc) is proposed to explicitly supervise the self-supervised model's focus on local pivotal regions, which are captured by a simple-but-effective location-wise mask sampling strategy. We show that the Local Discrimination pretext task can effectively enhance fine-grained clues in important local regions, and the global-local framework further refines the fine-grained feature representations of images. Extensive experimental results on different fine-grained object recognition tasks demonstrate that the proposed method can lead to a decent improvement in different evaluation settings. Meanwhile, the proposed method is also effective in general object recognition tasks.
Submitted 6 March, 2024; originally announced March 2024.
Comments: 11 pages, submitted
MSC Class: 68U10; ACM Class: I.4
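A location-wise mask sampling step can be sketched as keeping only a few local patches of each image (masking the rest) so the pretext task must rely on local regions. Patch selection below is random for simplicity, whereas the paper targets pivotal regions:

```python
# Zero out all but a few local patches per image, as input for a
# local-discrimination-style pretext task.
import torch

def locationwise_mask(images, patch=32, keep=4):
    """images: (B, C, H, W); keep `keep` random patches, mask the rest."""
    b, c, h, w = images.shape
    mask = torch.zeros(b, 1, h, w)
    for i in range(b):
        for _ in range(keep):
            top = torch.randint(0, h - patch + 1, (1,)).item()
            left = torch.randint(0, w - patch + 1, (1,)).item()
            mask[i, :, top:top + patch, left:left + patch] = 1.0
    return images * mask
```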
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.04066v1-abstract-full').style.display = 'none'; document.getElementById('2403.04066v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, submitted</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68U10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.02545">arXiv:2403.02545</a> <span> [<a href="https://arxiv.org/pdf/2403.02545">pdf</a>, <a href="https://arxiv.org/format/2403.02545">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Wukong: Towards a Scaling Law for Large-Scale Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Buyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Liang Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jade Nie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daifeng Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yanli Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shen Li</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yuchen Hao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yantao Yao</a>, <a href="/search/cs?searchtype=author&query=Lakshminarayanan%2C+G">Guna Lakshminarayanan</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+E+D">Ellie Dingqiao Wen</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jongsoo Park</a>, <a href="/search/cs?searchtype=author&query=Naumov%2C+M">Maxim Naumov</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenlin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.02545v4-abstract-short" style="display: inline;"> Scaling laws play an instrumental role in the sustainable improvement in model quality. Unfortunately, recommendation models to date do not exhibit such laws similar to those observed in the domain of large language models, due to the inefficiencies of their upscaling mechanisms. 
This limitation poses significant challenges in adapting these models to increasingly more complex real-world datasets. In this paper, we propose an effective network architecture based purely on stacked factorization machines, and a synergistic upscaling strategy, collectively dubbed Wukong, to establish a scaling law in the domain of recommendation. Wukong's unique design makes it possible to capture diverse, any-order interactions simply through taller and wider layers. We conducted extensive evaluations on six public datasets, and our results demonstrate that Wukong consistently outperforms state-of-the-art models quality-wise. Further, we assessed Wukong's scalability on an internal, large-scale dataset. The results show that Wukong retains its superiority in quality over state-of-the-art models while holding the scaling law across two orders of magnitude in model complexity, extending beyond 100 GFLOP/example, where prior arts fall short.
Submitted 4 June, 2024; v1 submitted 4 March, 2024; originally announced March 2024.
Comments: 12 pages
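The factorization-machine building block that such architectures stack admits the classic O(k) pairwise-interaction identity, 0.5 * ((sum of embeddings)^2 - sum of squared embeddings). A minimal sketch of that block (Wukong's actual layers add projections and stacking on top of this):

```python
# Classic FM pairwise-interaction layer over per-feature embeddings.
import torch
import torch.nn as nn

class FMInteraction(nn.Module):
    def forward(self, emb):                  # emb: (B, num_features, k)
        sum_sq = emb.sum(dim=1) ** 2         # square of sum, (B, k)
        sq_sum = (emb ** 2).sum(dim=1)       # sum of squares, (B, k)
        return 0.5 * (sum_sq - sq_sum)       # all pairwise interactions

emb = torch.randn(8, 16, 32)                 # batch of 16 feature embeddings
out = FMInteraction()(emb)                   # (8, 32)
```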
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Nie%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Nie%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>