Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 200 results for author: <span class="mathjax">Qu, L</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Qu%2C+L">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Qu, L"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Qu%2C+L&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Qu, L"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Qu%2C+L&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Qu%2C+L&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Qu%2C+L&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Qu%2C+L&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Qu%2C+L&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Qu%2C+L&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13415">arXiv:2411.13415</a> <span> [<a href="https://arxiv.org/pdf/2411.13415">pdf</a>, <a href="https://arxiv.org/format/2411.13415">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Power of Large Language Models for Group POI Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+J">Jing Long</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liang Qu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+G">Guanhua Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+Q+V+H">Quoc Viet Hung Nguyen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hongzhi Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13415v1-abstract-short" style="display: inline;"> Group Point-of-Interest (POI) recommendations aim to predict the next POI that satisfies the diverse preferences of a group of users. This task is more challenging than traditional individual POI recommendations due to complex group decision-making and extremely sparse group-level check-in data. 
Existing methods for group POI recommendations primarily rely on single ID-based features from check-in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13415v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13415v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13415v1-abstract-full" style="display: none;"> Group Point-of-Interest (POI) recommendations aim to predict the next POI that satisfies the diverse preferences of a group of users. This task is more challenging than traditional individual POI recommendations due to complex group decision-making and extremely sparse group-level check-in data. Existing methods for group POI recommendations primarily rely on single ID-based features from check-in data, capturing only statistical correlations and failing to fully utilize the rich semantic information contained in the check-ins, resulting in suboptimal performance. To this end, we propose a framework that unleashes the power of the Large Language Model (LLM) for context-aware group POI recommendations (LLMGPR). Our approach first introduces POI tokens alongside the original word tokens of the LLM, which are initialized by applying the LLM to the rich information of each POI. We then propose a novel sequencing adapter guided by Quantized Low-Rank Adaptation (QLORA) to modify the LLM. The enhanced LLM can learn sequence representations by combining semantic-enhanced POI tokens and rich contextual information including positional encodings and spatio-temporal differences. This approach can be adapted for learning either group or user representations depending on the sequence type. Furthermore, we enhance group representations by aggregating individual member representations with another QLORA-based aggregation adapter and introducing a self-supervised learning task that predicts the purpose of check-in sequences, alleviating the data sparsity issue. Our experimental results demonstrate that LLMGPR outperforms existing methods, effectively addressing group-level data sparsity and providing superior recommendations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13415v1-abstract-full').style.display = 'none'; document.getElementById('2411.13415v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
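   A minimal sketch of the ingredients this abstract names (new POI tokens plus a QLoRA adapter on a frozen LLM), assuming Hugging Face transformers/peft/bitsandbytes; the backbone name, rank, and token budget are illustrative assumptions, not the authors' released code.

```python
# Sketch: POI tokens + a QLoRA-style adapter on an off-the-shelf LLM.
# Backbone, rank, and token scheme are illustrative guesses, not the paper's setup.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

BASE = "meta-llama/Llama-2-7b-hf"  # placeholder backbone

tokenizer = AutoTokenizer.from_pretrained(BASE)
# New POI tokens added alongside the original word tokens.
tokenizer.add_tokens([f"<poi_{i}>" for i in range(10_000)])

model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=BitsAndBytesConfig(  # 4-bit base weights (the "Q" in QLoRA)
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
model.resize_token_embeddings(len(tokenizer))  # room for the POI embeddings
model = prepare_model_for_kbit_training(model)

# Low-rank adapter: only these small matrices are trained.
model = get_peft_model(model, LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM",
))
model.print_trainable_parameters()
```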
2. arXiv:2411.12205 (https://arxiv.org/abs/2411.12205) [pdf, other] cs.IR
   Sparser Training for On-Device Recommendation Systems
   Authors: Yunke Qu, Liang Qu, Tong Chen, Xiangyu Zhao, Jianxin Li, Hongzhi Yin
   Abstract: Recommender systems often rely on large embedding tables that map users and items to dense vectors of uniform size, leading to substantial memory consumption and inefficiencies. This is particularly problematic in memory-constrained environments like mobile and Web of Things (WoT) applications, where scalability and real-time performance are critical. Various research efforts have sought to address these issues. Although embedding-pruning methods based on Dynamic Sparse Training (DST) stand out for their low training and inference costs, consistent sparsity, and end-to-end differentiability, they face three key challenges. First, they typically initialize the mask matrix, which is used to prune redundant parameters, with random uniform sparse initialization; this often yields suboptimal performance because it creates unstructured and inefficient connections. Second, when reactivating pruned parameters with large gradient magnitudes, they tend to favor the users/items sampled in the single batch immediately before weight exploration, which does not necessarily improve overall performance. Third, while they use sparse weights during forward passes, they still compute dense gradients during backward passes. In this paper, we propose SparseRec, a lightweight embedding method based on DST, to address these issues. Specifically, SparseRec initializes the mask matrix using Nonnegative Matrix Factorization. It accumulates gradients to identify the inactive parameters that can better improve model performance after activation. Furthermore, it avoids dense gradients during backpropagation by sampling a subset of important vectors; gradients are calculated only for parameters in this subset, thus maintaining sparsity during training in both forward and backward passes.
   Submitted 18 November, 2024; originally announced November 2024.
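   As a rough illustration of SparseRec's NMF-based mask initialization (in contrast to random uniform sparse initialization), assuming scikit-learn and arbitrary shapes; the top-k rule is a plausible reading of the idea, not the paper's exact procedure.

```python
# Sketch: initializing a sparse embedding mask from NMF, keeping the
# top-k largest entries per row. Shapes and sparsity are arbitrary assumptions.
import numpy as np
from sklearn.decomposition import NMF

n_users, dim, keep = 1000, 64, 8              # embedding table: n_users x dim
interactions = np.random.rand(n_users, 500)   # stand-in user-item matrix

W = NMF(n_components=dim, init="nndsvd", max_iter=200).fit_transform(interactions)

# Binary mask: activate the `keep` largest NMF entries in each row,
# instead of random uniform sparse initialization.
mask = np.zeros_like(W, dtype=bool)
rows = np.arange(n_users)[:, None]
mask[rows, np.argsort(-W, axis=1)[:, :keep]] = True
embeddings = np.random.randn(n_users, dim) * mask  # pruned embedding table
```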
3. arXiv:2411.06442 (https://arxiv.org/abs/2411.06442) [pdf, other] cs.CV, cs.AI
   Local Implicit Wavelet Transformer for Arbitrary-Scale Super-Resolution
   Authors: Minghong Duan, Linhao Qu, Shaolei Liu, Manning Wang
   Abstract: Implicit neural representations have recently demonstrated promising potential in arbitrary-scale super-resolution (SR) of images. Most existing methods predict each pixel of the SR image from the queried coordinate and an ensemble of nearby features, overlooking the importance of incorporating high-frequency prior information, which limits their ability to reconstruct high-frequency texture details. To address this issue, we propose the Local Implicit Wavelet Transformer (LIWT) to enhance the restoration of high-frequency texture details. Specifically, we decompose the features extracted by an encoder into four sub-bands containing different frequency information using the Discrete Wavelet Transform (DWT). We then introduce the Wavelet Enhanced Residual Module (WERM) to transform these four sub-bands into high-frequency priors, and subsequently apply Wavelet Mutual Projected Fusion (WMPF) and Wavelet-aware Implicit Attention (WIA) to fully exploit the high-frequency prior information for recovering high-frequency details. We conducted extensive experiments on benchmark datasets to validate the effectiveness of LIWT. Both qualitative and quantitative results demonstrate that LIWT achieves promising performance on arbitrary-scale SR tasks, outperforming other state-of-the-art methods. The code is available at https://github.com/dmhdmhdmh/LIWT.
   Submitted 10 November, 2024; originally announced November 2024.
   Comments: Accepted by BMVC 2024
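   The DWT decomposition step itself is standard; a small sketch with PyWavelets, where the Haar wavelet and the feature-map shape are assumptions.

```python
# Sketch: one-level 2-D DWT splitting a feature map into the four
# sub-bands (LL, LH, HL, HH) mentioned above. The wavelet choice is an assumption.
import numpy as np
import pywt

feature_map = np.random.randn(64, 64)          # stand-in encoder feature
LL, (LH, HL, HH) = pywt.dwt2(feature_map, "haar")

# LL holds the low-frequency content; LH/HL/HH carry the horizontal,
# vertical and diagonal high-frequency detail used as priors.
print(LL.shape, LH.shape, HL.shape, HH.shape)  # each (32, 32)
```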
4. arXiv:2410.23861 (https://arxiv.org/abs/2410.23861) [pdf, other] cs.CL, cs.MM, cs.SD, eess.AS
   Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models
   Authors: Hao Yang, Lizhen Qu, Ehsan Shareghi, Gholamreza Haffari
   Abstract: Large Multimodal Models (LMMs) have demonstrated the ability to interact with humans under real-world conditions by combining Large Language Models (LLMs) with modality encoders that align multimodal information (visual and auditory) with text. However, such models raise a new safety question: do models that are safety-aligned on text exhibit consistent safeguards for multimodal inputs? Despite recent safety-alignment research on vision LMMs, the safety of audio LMMs remains under-explored. In this work, we comprehensively red team the safety of five advanced audio LMMs under three settings: (i) harmful questions in both audio and text formats, (ii) harmful questions in text format accompanied by distracting non-speech audio, and (iii) speech-specific jailbreaks. Our results under these settings demonstrate that open-source audio LMMs suffer an average attack success rate of 69.14% on harmful audio questions and exhibit safety vulnerabilities when distracted with non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro achieve an attack success rate of 70.67% on the harmful query benchmark. We provide insights into what could cause these safety misalignments. Warning: this paper contains offensive examples.
   Submitted 31 October, 2024; originally announced October 2024.
D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23861v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23861v1-abstract-full" style="display: none;"> Large Multimodal Models (LMMs) have demonstrated the ability to interact with humans under real-world conditions by combining Large Language Models (LLMs) and modality encoders to align multimodal information (visual and auditory) with text. However, such models raise new safety challenges of whether models that are safety-aligned on text also exhibit consistent safeguards for multimodal inputs. Despite recent safety-alignment research on vision LMMs, the safety of audio LMMs remains under-explored. In this work, we comprehensively red team the safety of five advanced audio LMMs under three settings: (i) harmful questions in both audio and text formats, (ii) harmful questions in text format accompanied by distracting non-speech audio, and (iii) speech-specific jailbreaks. Our results under these settings demonstrate that open-source audio LMMs suffer an average attack success rate of 69.14% on harmful audio questions, and exhibit safety vulnerabilities when distracted with non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro achieve an attack success rate of 70.67% on the harmful query benchmark. We provide insights on what could cause these reported safety-misalignments. Warning: this paper contains offensive examples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23861v1-abstract-full').style.display = 'none'; document.getElementById('2410.23861v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22057">arXiv:2410.22057</a> <span> [<a href="https://arxiv.org/pdf/2410.22057">pdf</a>, <a href="https://arxiv.org/format/2410.22057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FANCL: Feature-Guided Attention Network with Curriculum Learning for Brain Metastases Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zijiang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyu Liu</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yonghong Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22057v1-abstract-short" style="display: inline;"> Accurate segmentation of brain metastases (BMs) in MR image is crucial for the diagnosis and follow-up of patients. 
6. arXiv:2410.12588 (https://arxiv.org/abs/2410.12588) [pdf, other] cs.DC, cs.OS
   FALCON: Pinpointing and Mitigating Stragglers for Large-Scale Hybrid-Parallel Training
   Authors: Tianyuan Wu, Wei Wang, Yinghao Yu, Siran Yang, Wenchao Wu, Qinkai Duan, Guodong Yang, Jiamang Wang, Lin Qu, Liping Zhang
   Abstract: Fail-slows, or stragglers, are common but largely unheeded problems in large-scale hybrid-parallel training that spans thousands of GPU servers and runs for weeks to months. These problems are not well studied, nor can they currently be quickly detected and effectively mitigated. In this paper, we first present a characterization study on a shared production cluster with over 10,000 GPUs. We find that fail-slows are caused by various CPU/GPU computation and cross-node networking issues, last from tens of seconds to nearly ten hours, and collectively delay the average job completion time by 1.34%. The current practice is to detect these fail-slows manually and simply treat them as fail-stops using a checkpoint-and-restart failover approach, which is labor-intensive and time-consuming. We propose FALCON, a framework that rapidly identifies fail-slowed GPUs and/or communication links and tackles them with a novel multi-level mitigation mechanism, all without human intervention. We have applied FALCON to detect human-labeled fail-slows in a production cluster with over 99% accuracy. Cluster deployment further demonstrates that FALCON effectively handles manually injected fail-slows, reducing the training slowdown by 60.1%.
   Submitted 16 October, 2024; originally announced October 2024.
   Comments: 17 pages, 20 figures
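   FALCON's detector is not spelled out in this abstract; the toy sketch below only illustrates the general fail-slow idea of flagging workers whose step times drift above the fleet median, with an invented threshold.

```python
# Toy sketch of fail-slow detection: flag a worker whose recent step
# times drift well above the fleet median. The threshold is invented.
from statistics import median

def find_stragglers(step_times: dict[str, list[float]], factor: float = 1.5) -> list[str]:
    """step_times maps worker id -> recent per-iteration durations (seconds)."""
    per_worker = {w: median(t) for w, t in step_times.items() if t}
    fleet = median(per_worker.values())
    return [w for w, m in per_worker.items() if m > factor * fleet]

print(find_stragglers({"gpu0": [1.0, 1.1], "gpu1": [1.0, 1.0], "gpu2": [2.4, 2.6]}))
# ['gpu2']
```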
7. arXiv:2410.12458 (https://arxiv.org/abs/2410.12458) [pdf, other] cs.CL
   The Best of Both Worlds: Bridging Quality and Diversity in Data Selection with Bipartite Graph
   Authors: Minghao Wu, Thuy-Trang Vu, Lizhen Qu, Gholamreza Haffari
   Abstract: The performance of large language models (LLMs) on natural language processing (NLP) tasks is significantly influenced by the quality and diversity of the data used for supervised fine-tuning (SFT). Current data selection methods often focus solely on quality or on diversity, producing underperforming models trained on suboptimal data. In this paper, we introduce GraphFilter, a novel method that represents the dataset as a bipartite graph linking sentences to their constituent n-grams. This representation effectively captures the relationships between sentences and linguistic patterns, facilitating the selection of sentences that enhance n-gram diversity. To balance quality and diversity during selection, we propose a priority function that combines a quality metric with a diversity metric multiplicatively. GraphFilter iteratively selects high-priority sentences, updates the bipartite graph by removing covered n-grams, and recalculates priorities to reflect the evolving data landscape. We conduct extensive experiments using three model backbones across six widely used benchmarks. The results demonstrate that GraphFilter outperforms all nine baseline approaches in both model performance and computational efficiency. Our analyses validate our design choices, examine the subsets selected by GraphFilter and other methods, highlight the importance of instruction diversity, and explore the role of quality and diversity relative to subset size. GraphFilter establishes a new foundation for effective data selection strategies, encouraging further research on data selection for LLMs.
   Submitted 16 October, 2024; originally announced October 2024.
   Comments: 19 pages, 5 figures, 5 tables
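   A compact sketch of the selection loop this abstract describes: a multiplicative quality-times-diversity priority over a sentence/n-gram bipartite graph, with covered n-grams removed after each pick. The quality scores and whitespace tokenization are invented stand-ins.

```python
# Sketch of the bipartite quality-x-diversity selection loop: a sentence's
# diversity is the share of its n-grams not yet covered; priorities are
# recomputed as covered n-grams are removed. Quality scores are invented.
def ngrams(sentence: str, n: int = 2) -> set[tuple[str, ...]]:
    toks = sentence.split()
    return {tuple(toks[i:i + n]) for i in range(len(toks) - n + 1)}

def graph_filter(sentences: list[str], quality: list[float], budget: int) -> list[str]:
    grams = [ngrams(s) for s in sentences]
    covered: set = set()
    chosen: list[str] = []
    remaining = set(range(len(sentences)))
    while remaining and len(chosen) < budget:
        def priority(i: int) -> float:  # multiplicative combination
            fresh = len(grams[i] - covered)
            return quality[i] * (fresh / max(len(grams[i]), 1))
        best = max(remaining, key=priority)
        chosen.append(sentences[best])
        covered |= grams[best]          # remove covered n-grams from the graph
        remaining.discard(best)
    return chosen

print(graph_filter(["the cat sat down", "the cat ran off", "dogs bark loudly"],
                   [0.9, 0.8, 0.7], budget=2))
# ['the cat sat down', 'dogs bark loudly']
```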
8. arXiv:2410.11459 (https://arxiv.org/abs/2410.11459) [pdf, other] cs.CL
   Jigsaw Puzzles: Splitting Harmful Questions to Jailbreak Large Language Models
   Authors: Hao Yang, Lizhen Qu, Ehsan Shareghi, Gholamreza Haffari
   Abstract: Large language models (LLMs) have exhibited outstanding performance in engaging with humans and addressing complex questions by leveraging their vast implicit knowledge and robust reasoning capabilities. However, such models are vulnerable to jailbreak attacks, leading to the generation of harmful responses. Despite recent research on single-turn jailbreak strategies to facilitate the development of defence mechanisms, the challenge of revealing vulnerabilities in the multi-turn setting remains relatively under-explored. In this work, we propose Jigsaw Puzzles (JSP), a straightforward yet effective multi-turn jailbreak strategy against advanced LLMs. JSP splits questions into harmless fractions as the input of each turn, and requests LLMs to reconstruct and respond to the questions over the multi-turn interaction. Our experimental results demonstrate that JSP bypasses original safeguards against explicitly harmful content, achieving an average attack success rate of 93.76% on 189 harmful queries across 5 advanced LLMs (Gemini-1.5-Pro, Llama-3.1-70B, GPT-4, GPT-4o, GPT-4o-mini). Moreover, JSP achieves a state-of-the-art attack success rate of 92% on GPT-4 on the harmful query benchmark and exhibits strong resistance to defence strategies. Warning: this paper contains offensive examples.
   Submitted 15 October, 2024; originally announced October 2024.
9. arXiv:2410.10130 (https://arxiv.org/abs/2410.10130) [pdf, other] cs.IR
   DecKG: Decentralized Collaborative Learning with Knowledge Graph Enhancement for POI Recommendation
   Authors: Ruiqi Zheng, Liang Qu, Guanhua Ye, Tong Chen, Yuhui Shi, Hongzhi Yin
   Abstract: Decentralized collaborative learning for Point-of-Interest (POI) recommendation has gained research interest for its advantages in privacy preservation and efficiency: data stays local, and clients collaborate to train models in a decentralized manner. However, since local data is often limited and insufficient for training accurate models, a common solution is to integrate external knowledge as auxiliary information to enhance model performance. This solution nevertheless poses challenges for decentralized collaborative learning. Because local data is private, identifying the auxiliary information relevant to each user is non-trivial. Furthermore, resource-constrained local devices struggle to accommodate all the auxiliary information, which places a heavy burden on local storage. To fill this gap, we propose DecKG, a decentralized collaborative learning framework with knowledge graph (KG) enhancement for POI recommendation. Instead of directly uploading interacted items, users generate desensitized check-in data by uploading the general categories of interacted items and sampling similar items from the same category. The server then pretrains the KG without sensitive user-item interactions and deploys relevant partitioned sub-KGs to individual users. Entities are further refined on-device, and client-to-client communication exchanges knowledge learned from local data and sub-KGs. Evaluations across two real-world datasets demonstrate DecKG's effectiveness in recommendation performance.
   Submitted 13 October, 2024; originally announced October 2024.
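   A toy sketch of the desensitized-upload step (a category plus a sampled same-category item in place of the true check-in); the POI catalog below is invented.

```python
# Sketch of the desensitization step: instead of uploading the exact
# POIs a user visited, upload each POI's category plus a randomly
# sampled same-category stand-in. The catalog is invented.
import random

CATEGORY = {"cafe_12": "cafe", "cafe_98": "cafe", "gym_3": "gym", "gym_44": "gym"}
BY_CATEGORY: dict[str, list[str]] = {}
for poi, cat in CATEGORY.items():
    BY_CATEGORY.setdefault(cat, []).append(poi)

def desensitize(checkins: list[str]) -> list[tuple[str, str]]:
    """Return (category, sampled same-category POI) pairs for upload."""
    out = []
    for poi in checkins:
        cat = CATEGORY[poi]
        out.append((cat, random.choice(BY_CATEGORY[cat])))
    return out

print(desensitize(["cafe_12", "gym_3"]))  # e.g. [('cafe', 'cafe_98'), ('gym', 'gym_44')]
```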
10. arXiv:2410.06497 (https://arxiv.org/abs/2410.06497) [pdf, other] cs.IR, cs.AI, cs.DC, cs.LG
    ERCache: An Efficient and Reliable Caching Framework for Large-Scale User Representations in Meta's Ads System
    Authors: Fang Zhou, Yaning Huang, Dong Liang, Dai Li, Zhongke Zhang, Kai Wang, Xiao Xin, Abdallah Aboelela, Zheliang Jiang, Yang Wang, Jeff Song, Wei Zhang, Chen Liang, Huayu Li, ChongLin Sun, Hang Yang, Lei Qu, Zhan Shu, Mindi Yuan, Emanuele Maccherani, Taha Hayat, John Guo, Varna Puvvada, Uladzimir Pashkevich
    Abstract: The increasing complexity of the deep learning models used to compute user representations presents significant challenges, particularly under limited computational resources and strict service-level agreements (SLAs). Previous research has focused on optimizing model inference but has overlooked a critical question: is it necessary to perform user model inference for every ad request in large-scale social networks? To address this question and these challenges, we first analyze user access patterns at Meta and find that most user model inferences occur within a short timeframe. This observation reveals a triangular relationship among model complexity, embedding freshness, and service SLAs. Building on this insight, we designed, implemented, and evaluated ERCache, an efficient and robust caching framework for large-scale user representations in the ads recommendation systems of social networks. ERCache divides the cache into direct and failover types and applies customized settings and eviction policies to each model, effectively balancing model complexity, embedding freshness, and service SLAs, even accounting for the staleness introduced by caching. ERCache has been deployed at Meta for over six months, supporting more than 30 ranking models while efficiently conserving computational resources and complying with service SLA requirements.
    Submitted 8 October, 2024; originally announced October 2024.
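    A toy sketch of the direct/failover split described above: serve fresh entries from the direct cache, recompute when inference is allowed, and fall back to a stale entry otherwise. The TTLs and the inference stub are invented, not Meta's settings.

```python
# Toy sketch of a direct/failover cache for user embeddings. TTLs and
# the inference stub are invented illustrations of the two-tier idea.
import time

class ERCacheSketch:
    def __init__(self, direct_ttl: float = 60.0, failover_ttl: float = 3600.0):
        self.direct_ttl, self.failover_ttl = direct_ttl, failover_ttl
        self.store: dict[str, tuple[float, object]] = {}  # user -> (timestamp, embedding)

    def get(self, user: str, infer, allow_inference: bool = True):
        hit = self.store.get(user)
        age = time.time() - hit[0] if hit else None
        if hit and age < self.direct_ttl:        # fresh: direct cache hit
            return hit[1]
        if allow_inference:                      # recompute and refresh
            emb = infer(user)
            self.store[user] = (time.time(), emb)
            return emb
        if hit and age < self.failover_ttl:      # stale but still acceptable
            return hit[1]
        raise LookupError(f"no usable embedding for {user}")

cache = ERCacheSketch()
print(cache.get("u1", infer=lambda u: [0.1, 0.2]))          # computes and caches
print(cache.get("u1", infer=None, allow_inference=False))   # served from direct cache
```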
11. arXiv:2410.03049 (https://arxiv.org/abs/2410.03049) [pdf, other] cs.CL, cs.AI, cs.IR, cs.LG
    Scalable Frame-based Construction of Sociocultural NormBases for Socially-Aware Dialogues
    Authors: Shilin Qu, Weiqing Wang, Xin Zhou, Haolan Zhan, Zhuang Li, Lizhen Qu, Linhao Luo, Yuan-Fang Li, Gholamreza Haffari
    Abstract: Sociocultural norms serve as guiding principles for personal conduct in social interactions, emphasizing respect, cooperation, and appropriate behavior; they can benefit tasks including conversational information retrieval, contextual information retrieval, and retrieval-enhanced machine learning. We propose a scalable approach for constructing a Sociocultural Norm (SCN) base using Large Language Models (LLMs) for socially aware dialogues, and we construct a comprehensive and publicly accessible Chinese Sociocultural NormBase. Our approach uses socially aware dialogues, enriched with contextual frames, as the primary data source to constrain the generation process and reduce hallucinations. This enables the extraction of high-quality and nuanced natural-language norm statements that leverage the pragmatic implications of utterances with respect to the situation. As real dialogues annotated with gold frames are not readily available, we propose using synthetic data. Our empirical results show that (i) the quality of the SCNs derived from synthetic data is comparable to that from real dialogues annotated with gold frames, and (ii) the quality of the SCNs extracted from real data annotated with either silver (predicted) or gold frames surpasses that obtained without frame annotations. We further show the effectiveness of the extracted SCNs in a Retrieval-Augmented Generation (RAG) model for reasoning about multiple downstream dialogue tasks.
    Submitted 3 October, 2024; originally announced October 2024.
    Comments: 17 pages
    Journal ref: TOMM 2024
As real dialogues annotated with gold frames are not readily available, we propose using synthetic data. Our empirical results show: (i) the quality of the SCNs derived from synthetic data is comparable to that from real dialogues annotated with gold frames, and (ii) the quality of the SCNs extracted from real data, annotated with either silver (predicted) or gold frames, surpasses that without the frame annotations. We further show the effectiveness of the extracted SCNs in a RAG-based (Retrieval-Augmented Generation) model to reason about multiple downstream dialogue tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> TOMM 2024 </p> </li>
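<li> <p class="is-size-7">A frame-constrained extraction step of the kind described above might look as follows; the <code>llm</code> callable, the frame fields, and the prompt wording are hypothetical placeholders for illustration, not the paper's actual pipeline.</p> <pre><code class="language-python">
# Sketch: derive one norm statement from a dialogue plus its contextual frame.
def extract_scn(llm, dialogue_turns, frame):
    context = "\n".join(dialogue_turns)
    prompt = (
        "Dialogue:\n" + context + "\n"
        + "Contextual frame: location=" + frame["location"]
        + ", relation=" + frame["relation"]
        + ", occasion=" + frame["occasion"] + "\n"
        + "Grounded ONLY in the dialogue and frame above, state one "
        + "sociocultural norm as a single natural-language sentence."
    )
    # Constraining the prompt to the frame is what limits hallucination here.
    return llm(prompt).strip()
</code></pre> </li>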
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01463">arXiv:2410.01463</a> <span> [<a href="https://arxiv.org/pdf/2410.01463">pdf</a>, <a href="https://arxiv.org/format/2410.01463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Selective Aggregation for Low-Rank Adaptation in Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pengxin Guo</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuang Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanran Wang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Huijie Fan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feifei Wang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liangqiong Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We investigate LoRA in federated learning through the lens of the asymmetry analysis of the learned $A$ and $B$ matrices. In doing so, we uncover that $A$ matrices are responsible for learning general knowledge, while $B$ matrices focus on capturing client-specific knowledge. Based on this finding, we introduce Federated Share-A Low-Rank Adaptation (FedSA-LoRA), which employs two low-rank trainable matrices $A$ and $B$ to model the weight update, but only $A$ matrices are shared with the server for aggregation. Moreover, we delve into the relationship between the learned $A$ and $B$ matrices in other LoRA variants, such as rsLoRA and VeRA, revealing a consistent pattern. Consequently, we extend our FedSA-LoRA method to these LoRA variants, resulting in FedSA-rsLoRA and FedSA-VeRA. In this way, we establish a general paradigm for integrating LoRA with FL, offering guidance for future work on subsequent LoRA variants combined with FL. Extensive experimental results on natural language understanding and generation tasks demonstrate the effectiveness of the proposed method. Our code is available at https://github.com/Pengxin-Guo/FedSA-LoRA. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
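<li> <p class="is-size-7">The selective-aggregation rule is simple to state in code: clients upload only their LoRA $A$ matrices, the server averages them, and each client's $B$ matrices never leave the device. Tensor shapes and function names below are illustrative; the linked repository holds the authors' implementation.</p> <pre><code class="language-python">
# Minimal sketch of FedSA-style selective aggregation of LoRA A matrices.
import torch

def server_aggregate_A(client_As):
    """client_As: list of dicts, layer_name -> A tensor (r x d_in)."""
    layers = client_As[0].keys()
    return {name: torch.stack([c[name] for c in client_As]).mean(dim=0)
            for name in layers}

def client_update(local_A, local_B, global_A):
    # Overwrite the shared (general-knowledge) component; the client-specific
    # B matrices are kept untouched and never transmitted.
    for name in local_A:
        local_A[name] = global_A[name].clone()
    return local_A, local_B
</code></pre> </li>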
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19720">arXiv:2409.19720</a> <span> [<a href="https://arxiv.org/pdf/2409.19720">pdf</a>, <a href="https://arxiv.org/format/2409.19720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FAST: A Dual-tier Few-Shot Learning Paradigm for Whole Slide Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+K">Kexue Fu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiaoyuan Luo</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Ying Xiong</a>, <a href="/search/cs?searchtype=author&query=Maglogiannis%2C+I">Ilias Maglogiannis</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+L">Longxiang Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Manning Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The expensive fine-grained annotation and data scarcity have become the primary obstacles for the widespread adoption of deep learning-based Whole Slide Image (WSI) classification algorithms in clinical practice. Unlike few-shot learning methods in natural images that can leverage the labels of each image, existing few-shot WSI classification methods only utilize a small number of fine-grained labels or weakly supervised slide labels for training, in order to avoid expensive fine-grained annotation. They lack sufficient mining of the available WSIs, severely limiting WSI classification performance. To address the above issues, we propose a novel and efficient dual-tier few-shot learning paradigm for WSI classification, named FAST. FAST consists of a dual-level annotation strategy and a dual-branch classification framework. First, to avoid expensive fine-grained annotation, we collect a very small number of WSIs at the slide level and annotate an extremely small number of patches. Then, to fully mine the available WSIs, we use all the patches and available patch labels to build a cache branch, which utilizes the labeled patches to infer the labels of unlabeled patches and performs patch classification through knowledge retrieval. In addition to the cache branch, we also construct a prior branch that includes learnable prompt vectors, using the text encoder of visual-language models for patch classification. Finally, we integrate the results from both branches to achieve WSI classification. Extensive experiments on binary and multi-class datasets demonstrate that our proposed method significantly surpasses existing few-shot classification methods and approaches the accuracy of fully supervised methods with only 0.22% annotation costs. All codes and models will be publicly available at https://github.com/fukexue/FAST. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li>
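<li> <p class="is-size-7">The cache branch described above is retrieval-based: labeled patch features act as keys, their labels as values, and an unlabeled patch accumulates class evidence from its most similar keys. The exponential weighting below is a common similarity-based choice assumed for illustration, not necessarily the paper's exact formulation.</p> <pre><code class="language-python">
# Sketch of a cache-branch lookup: similarity-weighted label retrieval.
import torch
import torch.nn.functional as F

def cache_branch_logits(query_feats, key_feats, key_labels, num_classes, beta=5.0):
    """query_feats: (n, d); key_feats: (m, d); key_labels: (m,) int64."""
    q = F.normalize(query_feats, dim=-1)
    k = F.normalize(key_feats, dim=-1)
    affinity = q @ k.t()                                 # cosine similarities (n, m)
    weights = torch.exp(-beta * (1.0 - affinity))        # sharpen retrieval weights
    values = F.one_hot(key_labels, num_classes).float()  # labeled patches as values
    return weights @ values                              # (n, C) class evidence
</code></pre> </li>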
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19676">arXiv:2409.19676</a> <span> [<a href="https://arxiv.org/pdf/2409.19676">pdf</a>, <a href="https://arxiv.org/format/2409.19676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> See Detail Say Clear: Towards Brain CT Report Generation via Pathological Clue-driven Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chengxin Zheng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+J">Junzhong Ji</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yanzhao Shi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaodan Zhang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liangqiong Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Brain CT report generation is significant in aiding physicians to diagnose cranial diseases. Recent studies concentrate on handling the consistency between visual and textual pathological features to improve the coherence of reports. However, there exist some challenges: 1) Redundant visual representations: massive irrelevant areas in 3D scans distract models from representing salient visual contexts. 2) Shifted semantic representations: the limited medical corpus makes it difficult for models to transfer the learned textual representations to generative layers. This study introduces a Pathological Clue-driven Representation Learning (PCRL) model to build cross-modal representations based on pathological clues and naturally adapt them for accurate report generation. Specifically, we construct pathological clues from the perspectives of segmented regions, pathological entities, and report themes, to fully grasp visual pathological patterns and learn cross-modal feature representations. To adapt the representations for the text generation task, we bridge the gap between representation learning and report generation by using a unified large language model (LLM) with task-tailored instructions.
These crafted instructions enable the LLM to be flexibly fine-tuned across tasks and to smoothly transfer the semantic representations for report generation. Experiments demonstrate that our method outperforms previous methods and achieves SoTA performance. Our code is available at https://github.com/Chauncey-Jheng/PCRL-MRG. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our work has been accepted by EMNLP2024 findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12979">arXiv:2409.12979</a> <span> [<a href="https://arxiv.org/pdf/2409.12979">pdf</a>, <a href="https://arxiv.org/format/2409.12979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can we only use guideline instead of shot in prompt? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaxiang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhucong Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+W">Wayne Xiong</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zenglin Xu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Currently, prompting techniques can be mainly divided into two categories: 1) the shot method implicitly inspires the model to answer the question by mimicking the steps in the given example, e.g., the few-shot CoT.
2) The guideline method explicitly instructs the model to reason by following guidelines, which contain succinct and concise task-specific knowledge. The shot method is prone to difficulties in the selection of shot type, the number of shots, and the design of the reasoning steps, so a question arises: can we use only guidelines instead of shots in the prompt? To this end, we propose the FGT framework, consisting of Feedback, Guideline, and Tree-gather agents, to automatically learn task-specific guidelines from a dataset. First, the feedback agent is designed to evaluate the outcomes, both right and wrong, of each Q&A pair to gather insights guiding more effective optimization strategies. Next, the guideline agent is tasked with deriving guidelines from each piece of feedback and storing them in local memory. Lastly, the tree-gather agent aggregates all guidelines hierarchically through a tree structure, ultimately obtaining all unduplicated guidelines from a global perspective. In addition, we induce the model to generate intermediate processes to ensure the reasoning is consistent with the guidelines. Experimental results demonstrate that our approach achieves superior performance across multiple tasks, thereby highlighting the effectiveness of using guidelines in prompts. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
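<li> <p class="is-size-7">Read procedurally, the three agents form a simple loop. The sketch below uses a generic <code>llm</code> callable, a caller-supplied <code>grade_fn</code>, and invented prompt strings purely to show the data flow; it is not the paper's implementation.</p> <pre><code class="language-python">
# Sketch of the feedback -> guideline -> tree-gather loop.
def learn_guidelines(llm, qa_pairs, grade_fn):
    memory = []
    for question, answer in qa_pairs:
        outcome = grade_fn(question, answer)      # e.g. "correct" / "incorrect"
        # Feedback agent: explain why the outcome occurred.
        feedback = llm("Explain why this answer is " + outcome
                       + ":\nQ: " + question + "\nA: " + answer)
        # Guideline agent: distill the feedback and store it in local memory.
        guideline = llm("Distill one short task guideline from:\n" + feedback)
        memory.append(guideline.strip())
    # Tree-gather agent: hierarchically merge and deduplicate all guidelines.
    return llm("Merge into a deduplicated, hierarchical list:\n"
               + "\n".join(memory))
</code></pre> </li>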
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10196">arXiv:2409.10196</a> <span> [<a href="https://arxiv.org/pdf/2409.10196">pdf</a>, <a href="https://arxiv.org/format/2409.10196">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NEUSIS: A Compositional Neuro-Symbolic Framework for Autonomous Perception, Reasoning, and Planning in Complex UAV Search Missions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zhixi Cai</a>, <a href="/search/cs?searchtype=author&query=Cardenas%2C+C+R">Cristian Rojas Cardenas</a>, <a href="/search/cs?searchtype=author&query=Leo%2C+K">Kevin Leo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Backman%2C+K">Kal Backman</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hanbing Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Boying Li</a>, <a href="/search/cs?searchtype=author&query=Ghorbanali%2C+M">Mahsa Ghorbanali</a>, <a href="/search/cs?searchtype=author&query=Datta%2C+S">Stavya Datta</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Santiago%2C+J+G">Julian Gutierrez Santiago</a>, <a href="/search/cs?searchtype=author&query=Ignatiev%2C+A">Alexey Ignatiev</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuan-Fang Li</a>, <a href="/search/cs?searchtype=author&query=Vered%2C+M">Mor Vered</a>, <a href="/search/cs?searchtype=author&query=Stuckey%2C+P+J">Peter J Stuckey</a>, <a href="/search/cs?searchtype=author&query=de+la+Banda%2C+M+G">Maria Garcia de la Banda</a>, <a href="/search/cs?searchtype=author&query=Rezatofighi%2C+H">Hamid Rezatofighi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper addresses the problem of autonomous UAV search missions, where a UAV must locate specific Entities of Interest (EOIs) within a time limit, based on brief descriptions, in large, hazard-prone environments with keep-out zones. The UAV must perceive, reason, and make decisions with limited and uncertain information. We propose NEUSIS, a compositional neuro-symbolic system designed for interpretable UAV search and navigation in realistic scenarios. NEUSIS integrates neuro-symbolic visual perception, reasoning, and grounding (GRiD) to process raw sensory inputs, maintains a probabilistic world model for environment representation, and uses a hierarchical planning component (SNaC) for efficient path planning. Experimental results from simulated urban search missions using AirSim and Unreal Engine show that NEUSIS outperforms a state-of-the-art (SOTA) vision-language model and a SOTA search planning model in success rate, search efficiency, and 3D localization. These results demonstrate the effectiveness of our compositional neuro-symbolic approach in handling complex, real-world scenarios, making it a promising solution for autonomous UAV systems in search missions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07773">arXiv:2409.07773</a> <span> [<a href="https://arxiv.org/pdf/2409.07773">pdf</a>, <a href="https://arxiv.org/format/2409.07773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> PDC-FRS: Privacy-preserving Data Contribution for Federated Recommender System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaoqun Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Wei Yuan</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liang Qu</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T+T">Thanh Tam Nguyen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Federated recommender systems (FedRecs) have emerged as a popular research direction for protecting users' privacy in on-device recommendations. In FedRecs, users keep their data locally and only contribute their local collaborative information by uploading model parameters to a central server. While this rigid framework protects users' raw data during training, it severely compromises the recommendation model's performance for the following reasons: (1) Due to the power-law distribution of user behavior data, individual users have few data points to train a recommendation model, resulting in uploaded model updates that may be far from optimal; (2) As each user's uploaded parameters are learned from local data, which lacks global collaborative information, relying solely on parameter aggregation methods such as FedAvg to fuse global collaborative information may be suboptimal. To bridge this performance gap, we propose a novel federated recommendation framework, PDC-FRS. Specifically, we design a privacy-preserving data contribution mechanism that allows users to share their data with a differential privacy guarantee. Based on the shared but perturbed data, an auxiliary model is trained in parallel with the original federated recommendation process. This auxiliary model enhances FedRec by augmenting each user's local dataset and integrating global collaborative information. To demonstrate the effectiveness of PDC-FRS, we conduct extensive experiments on two widely used recommendation datasets. The empirical results showcase the superiority of PDC-FRS compared to baseline methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
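<li> <p class="is-size-7">A textbook way to realize a privacy-preserving data contribution of this kind is randomized response over binary interaction vectors; the paper's exact mechanism may differ, so treat the following as a generic illustration of sharing perturbed data under an epsilon budget.</p> <pre><code class="language-python">
# Sketch: epsilon-differentially-private perturbation via randomized response.
import math
import random

def perturb_interactions(bits, epsilon):
    """bits: list of 0/1 user-item interactions; returns a perturbed copy."""
    # Probability of reporting the true bit under the standard mechanism.
    p_keep = math.exp(epsilon) / (math.exp(epsilon) + 1.0)
    return [b if random.random() <= p_keep else 1 - b for b in bits]
</code></pre> </li>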
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04473">arXiv:2409.04473</a> <span> [<a href="https://arxiv.org/pdf/2409.04473">pdf</a>, <a href="https://arxiv.org/format/2409.04473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning in Order! A Sequential Strategy to Learn Invariant Features for Multimodal Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xianbing Zhao</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jianfei Cai</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+B">Buzhou Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This work proposes a novel and simple sequential learning strategy to train models on videos and texts for multimodal sentiment analysis. To estimate sentiment polarities on unseen out-of-distribution data, we introduce a multimodal model that is trained either in a single source domain or multiple source domains using our learning strategy. This strategy starts with learning domain-invariant features from text, followed by learning sparse domain-agnostic features from videos, assisted by the selected features learned from text. Our experimental results demonstrate that our model achieves significantly better performance than the state-of-the-art approaches on average in both single-source and multi-source settings. Our feature selection procedure favors the features that are independent of each other and are strongly correlated with their polarity labels. To facilitate research on this topic, the source code of this work will be publicly available upon acceptance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
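<li> <p class="is-size-7">The stated selection criterion (features strongly correlated with polarity labels yet independent of each other) can be approximated greedily with plain correlations; the thresholds, greedy order, and function name below are illustrative assumptions, not the paper's procedure.</p> <pre><code class="language-python">
# Sketch: greedy selection of label-relevant, mutually non-redundant features.
import numpy as np

def select_features(feats, labels, k=32, redundancy_cap=0.3):
    """feats: (n, d) array; labels: (n,) array; returns selected column indices."""
    d = feats.shape[1]
    # Relevance: absolute correlation of each feature with the polarity labels.
    relevance = np.abs([np.corrcoef(feats[:, j], labels)[0, 1] for j in range(d)])
    chosen = []
    for j in np.argsort(-relevance):          # most label-correlated first
        independent = all(
            abs(np.corrcoef(feats[:, j], feats[:, c])[0, 1]) <= redundancy_cap
            for c in chosen)
        if independent:
            chosen.append(int(j))
        if len(chosen) == k:
            break
    return chosen
</code></pre> </li>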
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01020">arXiv:2409.01020</a> <span> [<a href="https://arxiv.org/pdf/2409.01020">pdf</a>, <a href="https://arxiv.org/format/2409.01020">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Fed-MUnet: Multi-modal Federated Unet for Brain Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Ruojun Zhou</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lisha Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziming Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hongwei Yu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bing Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Deep learning-based techniques have been widely utilized for brain tumor segmentation using both single and multi-modal Magnetic Resonance Imaging (MRI) images. Most current studies focus on centralized training due to the intrinsic challenge of data sharing across clinics. To mitigate privacy concerns, researchers have introduced Federated Learning (FL) methods to brain tumor segmentation tasks. However, such methods currently focus on single-modal MRI, with limited study of multi-modal MRI. The challenges include the complex structure, large-scale parameters, and overfitting issues of FL-based methods using multi-modal MRI. To address the above challenges, we propose a novel multi-modal FL framework for brain tumor segmentation (Fed-MUnet) that is suitable for FL training. We evaluate our approach with the BraTS2022 datasets, which are publicly available.
The experimental results demonstrate that our framework achieves the FL goals of distributed learning and privacy preservation. For the enhancing tumor, tumor core, and whole tumor, the means of five major metrics were 87.5%, 90.6%, and 92.2%, respectively, which were higher than those of SOTA methods while preserving privacy. In terms of parameter count, floating-point operations (FLOPs), and inference, Fed-MUnet is Pareto-optimal compared with state-of-the-art segmentation backbones while achieving higher performance and tackling privacy issues. Our codes are open-sourced at https://github.com/Arnold-Jun/Fed-MUnet. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 3 figures, 2 tables. It was accepted by 2024 IEEE International Conference on E-health Networking, Application & Services (HealthCom)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12300">arXiv:2408.12300</a> <span> [<a href="https://arxiv.org/pdf/2408.12300">pdf</a>, <a href="https://arxiv.org/format/2408.12300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Tackling Data Heterogeneity in Federated Learning via Loss Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shuang Zeng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pengxin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianbo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuyin Zhou</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liangqiong Qu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Federated Learning (FL) is a rising approach towards collaborative and privacy-preserving machine learning where large-scale medical datasets remain localized to each client. However, the issue of data heterogeneity among clients often compels local models to diverge, leading to suboptimal global models.
To mitigate the impact of data heterogeneity on FL performance, we begin by analyzing how FL training influences FL performance by decomposing the global loss into three terms: local loss, distribution shift loss, and aggregation loss. Remarkably, our loss decomposition reveals that existing local training-based FL methods attempt to reduce the distribution shift loss, while global aggregation-based FL methods propose better aggregation strategies to reduce the aggregation loss. Nevertheless, a comprehensive joint effort to minimize all three terms is currently limited in the literature, leading to subpar performance when dealing with data heterogeneity challenges. To fill this gap, we propose a novel FL method based on global loss decomposition, called FedLD, to jointly reduce these three loss terms. Our FedLD involves a margin control regularization in local training to reduce the distribution shift loss, and a principal gradient-based server aggregation strategy to reduce the aggregation loss. Notably, under different levels of data heterogeneity, our strategies achieve better and more robust performance on retinal and chest X-ray classification compared to other FL algorithms. Our code is available at https://github.com/Zeng-Shuang/FedLD. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Modify some typos (MICCAI 2024)</span> </p> </li>
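<li> <p class="is-size-7">Schematically, the decomposition reads as below; the notation is inferred from the abstract rather than quoted from the paper. FedLD's two components then target the last two terms: margin control regularization attacks the shift term during local training, and principal gradient-based aggregation attacks the aggregation term at the server.</p> <p class="mathjax">$$ \mathcal{L}_{\mathrm{global}} \;=\; \underbrace{\mathcal{L}_{\mathrm{local}}}_{\text{per-client fit}} \;+\; \underbrace{\mathcal{L}_{\mathrm{shift}}}_{\text{distribution shift}} \;+\; \underbrace{\mathcal{L}_{\mathrm{agg}}}_{\text{aggregation}} $$</p> </li>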
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11505">arXiv:2408.11505</a> <span> [<a href="https://arxiv.org/pdf/2408.11505">pdf</a>, <a href="https://arxiv.org/format/2408.11505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MSCPT: Few-shot Whole Slide Image Classification with Multi-scale and Context-focused Prompt Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+M">Minghao Han</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dingkang Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xukun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaoying Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lihua Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Multiple instance learning (MIL) has become a standard paradigm for the weakly supervised classification of whole slide images (WSIs). However, this paradigm relies on a large number of labelled WSIs for training. The lack of training data and the presence of rare diseases present significant challenges for these methods. Prompt tuning combined with pre-trained Vision-Language Models (VLMs) is an effective solution to Few-shot Weakly Supervised WSI Classification (FSWC) tasks. Nevertheless, applying prompt tuning methods designed for natural images to WSIs presents three significant challenges: 1) these methods fail to fully leverage the prior knowledge from the VLM's text modality; 2) they overlook the essential multi-scale and contextual information in WSIs, leading to suboptimal results; and 3) they lack exploration of instance aggregation methods. To address these problems, we propose a Multi-Scale and Context-focused Prompt Tuning (MSCPT) method for FSWC tasks. Specifically, MSCPT employs a frozen large language model to generate pathological visual-language prior knowledge at multiple scales, guiding hierarchical prompt tuning.
Additionally, we design a graph prompt tuning module to learn essential contextual information within a WSI; finally, a non-parametric cross-guided instance aggregation module is introduced to obtain the WSI-level features. Based on two VLMs, extensive experiments and visualizations on three datasets demonstrate the strong performance of our MSCPT. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 5 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04806">arXiv:2408.04806</a> <span> [<a href="https://arxiv.org/pdf/2408.04806">pdf</a>, <a href="https://arxiv.org/format/2408.04806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> When Refreshable Tactile Displays Meet Conversational Agents: Investigating Accessible Data Presentation and Analysis with Touch and Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Reinders%2C+S">Samuel Reinders</a>, <a href="/search/cs?searchtype=author&query=Butler%2C+M">Matthew Butler</a>, <a href="/search/cs?searchtype=author&query=Zukerman%2C+I">Ingrid Zukerman</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Bongshin Lee</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Marriott%2C+K">Kim Marriott</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Despite the recent surge of research efforts to make data visualizations accessible to people who are blind or have low vision (BLV), how to support BLV people's data analysis remains an important and challenging question.
As refreshable tactile displays (RTDs) become cheaper and conversational agents continue to improve, their combination provides a promising approach to support BLV people's interactive data exploration and analysis. To understand how BLV people would use and react to a system combining an RTD with a conversational agent, we conducted a Wizard-of-Oz study with 11 BLV participants, where they interacted with line charts, bar charts, and isarithmic maps. Our analysis of participants' interactions led to the identification of nine distinct patterns. We also learned that the choice of modalities depended on the type of task and prior experience with tactile graphics, and that participants strongly preferred the combination of RTD and speech to a single modality. In addition, participants with more tactile experience described how tactile images facilitated a deeper engagement with the data and supported independent interpretation. Our findings will inform the design of interfaces for such interactive mixed-modality systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to be presented at IEEE VIS 2024 (Honorable Mention Award) and published in IEEE TVCG</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01929">arXiv:2408.01929</a> <span> [<a href="https://arxiv.org/pdf/2408.01929">pdf</a>, <a href="https://arxiv.org/format/2408.01929">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing H&E-to-IHC Stain Translation in Breast Cancer: A Multi-Magnification and Attention-Based Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+L">Linhao Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chengsheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guihui Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Haiyong Zheng</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chen Peng</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Breast cancer presents a significant healthcare challenge globally, demanding precise diagnostics and effective treatment strategies, where histopathological examination of Hematoxylin and Eosin (H&E) stained tissue sections plays a central role.
Despite its importance, evaluating specific biomarkers like Human Epidermal Growth Factor Receptor 2 (HER2) for personalized treatment remains constrained by the resource-intensive nature of Immunohistochemistry (IHC). Recent strides in deep learning, particularly in image-to-image translation, offer promise in synthesizing IHC-HER2 slides from H&E stained slides. However, existing methodologies encounter challenges, including managing multiple magnifications in pathology images and insufficient focus on crucial information during translation. To address these issues, we propose a novel model integrating attention mechanisms and multi-magnification information processing. Our model employs a multi-magnification processing strategy to extract and utilize information from various magnifications within pathology images, facilitating robust image translation. Additionally, an attention module within the generative network prioritizes critical information for image distribution translation while minimizing less pertinent details. Rigorous testing on a publicly available breast cancer dataset demonstrates superior performance compared to existing methods, establishing our model as a state-of-the-art solution in advancing pathology image translation from H&E to IHC staining. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE CIS-RAM 2024 Invited Session Oral</span> </p> </li>
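<li> <p class="is-size-7">One plausible reading of "multi-magnification processing plus an attention module" is an attention-gated fusion of features from two magnifications. The block below is a generic sketch under that assumption, with arbitrary channel sizes; it is not the paper's generator.</p> <pre><code class="language-python">
# Sketch: fuse high- and low-magnification features, then gate spatially.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttnFuse(nn.Module):
    def __init__(self, ch=64):
        super().__init__()
        self.mix = nn.Conv2d(2 * ch, ch, kernel_size=1)
        self.gate = nn.Conv2d(ch, 1, kernel_size=1)

    def forward(self, feat_high, feat_low):
        # Upsample the low-magnification (contextual) features to match.
        feat_low = F.interpolate(feat_low, size=feat_high.shape[-2:],
                                 mode="bilinear", align_corners=False)
        fused = self.mix(torch.cat([feat_high, feat_low], dim=1))
        mask = torch.sigmoid(self.gate(fused))  # per-pixel importance in [0, 1]
        return fused * mask                     # emphasize critical regions
</code></pre> </li>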
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21721">arXiv:2407.21721</a> <span> [<a href="https://arxiv.org/pdf/2407.21721">pdf</a>, <a href="https://arxiv.org/format/2407.21721">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Open-Vocabulary Audio-Visual Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+R">Ruohao Guo</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Liao Qu</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+D">Dantong Niu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yanyu Qi</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+W">Wenzhen Yue</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Ji Shi</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+B">Bowei Xing</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+X">Xianghua Ying</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Audio-visual semantic segmentation (AVSS) aims to segment and classify sounding objects in videos with acoustic cues. However, most approaches operate on the closed-set assumption and only identify pre-defined categories from training data, lacking the generalization ability to detect novel categories in practical applications. In this paper, we introduce a new task: open-vocabulary audio-visual semantic segmentation, extending the AVSS task to open-world scenarios beyond the annotated label space. This is a more challenging task that requires recognizing all categories, even those that have never been seen nor heard during training. Moreover, we propose the first open-vocabulary AVSS framework, OV-AVSS, which mainly consists of two parts: 1) a universal sound source localization module to perform audio-visual fusion and locate all potential sounding objects and 2) an open-vocabulary classification module to predict categories with the help of the prior knowledge from large-scale pre-trained vision-language models.
To properly evaluate open-vocabulary AVSS, we split zero-shot training and testing subsets based on the AVSBench-semantic benchmark, namely AVSBench-OV. Extensive experiments demonstrate the strong segmentation and zero-shot generalization ability of our model on all categories. On the AVSBench-OV dataset, OV-AVSS achieves 55.43% mIoU on base categories and 29.14% mIoU on novel categories, exceeding the state-of-the-art zero-shot method by 41.88%/20.61% and the open-vocabulary method by 10.2%/11.6%. The code is available at https://github.com/ruohaoguo/ovavss. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM 2024 (Oral)</span> </p> </li>
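<li> <p class="is-size-7">The open-vocabulary classification step reduces to scoring region (sounding-object) features against text embeddings of arbitrary category names from a pre-trained vision-language model. In the sketch below, the encoders are passed in as callables and the temperature is an illustrative value, not taken from the paper.</p> <pre><code class="language-python">
# Sketch: open-vocabulary classification against free-form category names.
import torch
import torch.nn.functional as F

def open_vocab_classify(region_feats, class_names, text_encoder, temperature=0.07):
    """region_feats: (n, d); returns (n,) predicted class indices."""
    text_feats = text_encoder(class_names)  # (C, d), any category list at test time
    sims = F.normalize(region_feats, dim=-1) @ F.normalize(text_feats, dim=-1).t()
    probs = (sims / temperature).softmax(dim=-1)
    return probs.argmax(dim=-1)
</code></pre> </li>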

arXiv:2407.17726 (https://arxiv.org/abs/2407.17726) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition)
Title: Multi-modal Data Binding for Survival Analysis Modeling with Incomplete Data and Annotations
Authors: Linhao Qu, Dan Huang, Shaoting Zhang, Xiaosong Wang
Abstract: Survival analysis stands as a pivotal process in cancer treatment research, crucial for accurately predicting patient survival rates. Recent advancements in data collection techniques have paved the way for enhancing survival predictions by integrating information from multiple modalities. However, real-world scenarios often present challenges with incomplete data, particularly when dealing with censored survival labels. Prior works have addressed missing modalities but have overlooked incomplete labels, which can introduce bias and limit model efficacy. To bridge this gap, we introduce a novel framework that simultaneously handles incomplete data across modalities and censored survival labels. Our approach employs advanced foundation models to encode individual modalities and align them into a universal representation space for seamless fusion. By generating pseudo labels and incorporating uncertainty, we significantly enhance predictive accuracy. The proposed method demonstrates outstanding prediction accuracy in two survival analysis tasks on both of the datasets employed. This innovative approach overcomes limitations associated with disparate modalities and improves the feasibility of comprehensive survival analysis using multiple large foundation models.
Submitted 24 July, 2024; originally announced July 2024.
Comments: Accepted by MICCAI 2024

arXiv:2407.17274 (https://arxiv.org/abs/2407.17274) [pdf, other]
Subjects: cs.MM (Multimedia); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken Generation
Authors: Yongqi Li, Hongru Cai, Wenjie Wang, Leigang Qu, Yinwei Wei, Wenjie Li, Liqiang Nie, Tat-Seng Chua
Abstract: Text-to-image retrieval is a fundamental task in multimedia processing, aiming to retrieve semantically relevant cross-modal content. Traditional studies have typically approached this task as a discriminative problem, matching the text and image via the cross-attention mechanism (one-tower framework) or in a common embedding space (two-tower framework). Recently, generative cross-modal retrieval has emerged as a new research line, which assigns images unique string identifiers and generates the target identifier as the retrieval target. Despite its great potential, existing generative approaches are limited by the following issues: insufficient visual information in identifiers, misalignment with high-level semantics, and a learning gap towards the retrieval target. To address these issues, we propose an autoregressive voken generation method, named AVG. AVG tokenizes images into vokens, i.e., visual tokens, and innovatively formulates the text-to-image retrieval task as a token-to-voken generation problem. AVG discretizes an image into a sequence of vokens as the identifier of the image, while maintaining alignment with both the visual information and high-level semantics of the image. Additionally, to bridge the learning gap between generative training and the retrieval target, we incorporate discriminative training to modify the learning direction during token-to-voken training. Extensive experiments demonstrate that AVG achieves superior results in both effectiveness and efficiency.
Submitted 24 July, 2024; originally announced July 2024.
Comments: Work in progress
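
The tokenization idea can be pictured as nearest-neighbour assignment against a codebook, so each image becomes a short sequence of voken ids that serves as its identifier. A toy sketch with a random codebook; AVG's actual tokenizer, its semantic alignment, and the discriminative training are not modeled here:

```python
import numpy as np

rng = np.random.default_rng(0)
codebook = rng.normal(size=(1024, 64))   # hypothetical voken vocabulary
patches = rng.normal(size=(16, 64))      # hypothetical patch features of one image

# "Tokenize" the image: each patch maps to the index of its nearest
# codebook entry, so the image's identifier is a 16-voken sequence.
d2 = ((patches[:, None, :] - codebook[None, :, :]) ** 2).sum(axis=-1)
voken_ids = d2.argmin(axis=1)
print(voken_ids)

# Retrieval then amounts to autoregressively generating this sequence
# from the query text and looking up the image it indexes.
```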

arXiv:2407.16154 (https://arxiv.org/abs/2407.16154) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: DDK: Distilling Domain Knowledge for Efficient Large Language Models
Authors: Jiaheng Liu, Chenchen Zhang, Jinyang Guo, Yuanxing Zhang, Haoran Que, Ken Deng, Zhiqi Bai, Jie Liu, Ge Zhang, Jiakai Wang, Yanan Wu, Congnan Liu, Wenbo Su, Jiamang Wang, Lin Qu, Bo Zheng
Abstract: Despite the advanced intelligence abilities of large language models (LLMs) in various applications, they still face significant computational and storage demands. Knowledge Distillation (KD) has emerged as an effective strategy to improve the performance of a smaller LLM (i.e., the student model) by transferring knowledge from a high-performing LLM (i.e., the teacher model). Prevailing techniques in LLM distillation typically use a black-box model API to generate high-quality pretrained and aligned datasets, or utilize white-box distillation by altering the loss function to better transfer knowledge from the teacher LLM. However, these methods ignore the knowledge differences between the student and teacher LLMs across domains. This results in excessive focus on domains with minimal performance gaps and insufficient attention to domains with large gaps, reducing overall performance. In this paper, we introduce a new LLM distillation framework called DDK, which dynamically adjusts the composition of the distillation dataset in a smooth manner according to the domain performance differences between the teacher and student models, making the distillation process more stable and effective. Extensive evaluations show that DDK significantly improves the performance of student models, outperforming both continuously pretrained baselines and existing knowledge distillation methods by a large margin.
Submitted 22 July, 2024; originally announced July 2024.
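
The reweighting idea can be sketched by turning per-domain teacher-student gaps into sampling probabilities, with a temperature supplying the smoothness the abstract mentions. The domain names, losses, and softmax form below are illustrative assumptions, not DDK's exact formulation:

```python
import numpy as np

# Hypothetical per-domain validation losses for teacher and student.
domains = ["code", "math", "web", "wiki"]
teacher_loss = np.array([1.1, 1.4, 2.0, 1.8])
student_loss = np.array([1.6, 2.5, 2.1, 1.9])

# Domains where the student lags the teacher most get more weight in the
# distillation mixture; a larger temperature tau yields a smoother,
# more stable mixture (no domain is starved).
gap = np.clip(student_loss - teacher_loss, 0.0, None)
tau = 1.0
weights = np.exp(gap / tau)
weights /= weights.sum()
for d, w in zip(domains, weights):
    print(f"{d}: sampling probability {w:.2f}")
```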

arXiv:2407.15411 (https://arxiv.org/abs/2407.15411) [pdf, other]
Subjects: cs.IR (Information Retrieval)
Title: Scalable Dynamic Embedding Size Search for Streaming Recommendation
Authors: Yunke Qu, Liang Qu, Tong Chen, Xiangyu Zhao, Quoc Viet Hung Nguyen, Hongzhi Yin
Abstract: Recommender systems typically represent users and items by learning their embeddings, which are usually set to uniform dimensions and dominate the model parameters. However, real-world recommender systems often operate in streaming scenarios, where the number of users and items continues to grow, leading to substantial storage resource consumption for these embeddings. Although a few methods attempt to mitigate this by employing embedding size search strategies to assign different embedding dimensions in streaming recommendations, they assume that the embedding size grows with the frequency of users/items, which eventually still exceeds the predefined memory budget over time. To address this issue, this paper proposes to learn Scalable Lightweight Embeddings for streaming recommendation, called SCALL, which can adaptively adjust the embedding sizes of users/items within a given memory budget over time. Specifically, we propose to sample embedding sizes from a probabilistic distribution, with a guarantee of meeting any predefined memory budget. With the memory budget fixed, the proposed embedding size sampling strategy can increase and decrease embedding sizes in accordance with the frequency of the corresponding users or items. Furthermore, we develop a reinforcement learning-based search paradigm that models each state with mean pooling to keep the length of the state vectors fixed, invariant to the changing number of users and items. As a result, the proposed method can provide embedding sizes for unseen users and items. Comprehensive empirical evaluations on two public datasets confirm the effectiveness of our proposed method.
Submitted 31 July, 2024; v1 submitted 22 July, 2024; originally announced July 2024.
Comments: Accepted to CIKM 2024
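
The budget guarantee can be illustrated by drawing per-entity embedding sizes from a frequency-based multinomial whose total never exceeds the budget, however many users and items arrive. This is a stand-in for SCALL's actual sampling strategy and reinforcement-learning search; all quantities are hypothetical:

```python
import numpy as np

rng = np.random.default_rng(0)
n = 10_000                    # current number of users/items (keeps growing)
freq = rng.integers(1, 1000, size=n).astype(float)  # hypothetical interaction counts
budget = 2_000_000            # fixed total embedding-parameter budget
d_min, d_max = 2, 128         # allowed embedding-size range

# Draw sizes from a frequency-based multinomial: each entity gets at least
# d_min dimensions, the remaining budget is split in proportion to
# popularity, and the total can never exceed the budget.
probs = freq / freq.sum()
sizes = d_min + rng.multinomial(budget - d_min * n, probs)
sizes = np.minimum(sizes, d_max)   # cap very hot entities
print(sizes.sum() <= budget, sizes.min(), sizes.max())
```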

arXiv:2407.13703 (https://arxiv.org/abs/2407.13703) [pdf, other]
Subjects: cs.IT (Information Theory); cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: Energy-Efficient Channel Decoding for Wireless Federated Learning: Convergence Analysis and Adaptive Design
Authors: Linping Qu, Yuyi Mao, Shenghui Song, Chi-Ying Tsui
Abstract: One of the most critical challenges for deploying distributed learning solutions, such as federated learning (FL), in wireless networks is the limited battery capacity of mobile clients. While it is a common belief that the major energy consumption of mobile clients comes from uplink data transmission, this paper presents a novel finding, namely that channel decoding also contributes significantly to the overall energy consumption of mobile clients in FL. Motivated by this new observation, we propose an energy-efficient adaptive channel decoding scheme that leverages the intrinsic robustness of FL to model errors. In particular, this robustness is exploited to reduce the energy consumption of channel decoders at mobile clients by adaptively adjusting the number of decoding iterations. We theoretically prove that wireless FL with communication errors can converge at the same rate as the error-free case, provided that the bit error rate (BER) is properly constrained. An adaptive channel decoding scheme is then proposed to improve the energy efficiency of wireless FL systems. Experimental results demonstrate that the proposed method maintains the same learning accuracy while reducing channel decoding energy consumption by approximately 20% compared with an existing approach.
Submitted 4 September, 2024; v1 submitted 26 June, 2024; originally announced July 2024.
Comments: This paper has been accepted by IEEE TWC
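
The energy knob here is the iteration count: a decoder that stops as soon as its parity checks clear spends fewer iterations, and hence less energy, on easy blocks. A toy bit-flipping sketch of that early-exit behaviour; the paper's scheme additionally derives the iteration budget from an FL-level BER constraint, which is not modeled here:

```python
import numpy as np

def decode_adaptive(llr, H, max_iters):
    """Toy bit-flipping decoder: stop as soon as the syndrome clears,
    so easy blocks cost few iterations (and little decoding energy)."""
    bits = (llr < 0).astype(int)
    for it in range(1, max_iters + 1):
        syndrome = H @ bits % 2
        if not syndrome.any():            # valid codeword: early exit
            return bits, it
        votes = H.T @ syndrome            # unsatisfied checks per bit
        bits[np.argmax(votes)] ^= 1       # flip the most suspect bit
    return bits, max_iters

# Hypothetical tiny parity-check matrix and received LLRs.
H = np.array([[1, 1, 0, 1, 0, 0],
              [0, 1, 1, 0, 1, 0],
              [1, 0, 1, 0, 0, 1]])
llr = np.random.default_rng(1).normal(1.0, 1.0, size=6)
bits, used = decode_adaptive(llr, H, max_iters=10)
print(bits, "iterations used:", used)
```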

arXiv:2407.12611 (https://arxiv.org/abs/2407.12611) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Deep Mutual Learning among Partially Labeled Datasets for Multi-Organ Segmentation
Authors: Xiaoyu Liu, Linhao Qu, Ziyue Xie, Yonghong Shi, Zhijian Song
Abstract: Labeling multiple organs for segmentation is a complex and time-consuming process, resulting in a scarcity of comprehensively labeled multi-organ datasets alongside an abundance of partially labeled ones. Current methods are inadequate in effectively utilizing the supervised information available from these datasets, thereby impeding progress in improving segmentation accuracy. This paper proposes a two-stage multi-organ segmentation method based on mutual learning, aiming to improve multi-organ segmentation performance by complementing information among partially labeled datasets. In the first stage, each partial-organ segmentation model utilizes the non-overlapping organ labels from different datasets and the distinct organ features extracted by different models, introducing additional mutual difference learning to generate higher-quality pseudo labels for unlabeled organs. In the second stage, each full-organ segmentation model is supervised by fully labeled datasets with pseudo labels and leverages true labels from other datasets, while dynamically sharing accurate features across different models, introducing additional mutual similarity learning to enhance multi-organ segmentation performance. Extensive experiments were conducted on nine datasets covering the head and neck, chest, abdomen, and pelvis. The results indicate that our method achieves state-of-the-art performance in segmentation tasks that rely on partial labels, and the ablation studies thoroughly confirm the efficacy of the mutual learning mechanism.
Submitted 17 July, 2024; originally announced July 2024.
Comments: 10 pages, 9 figures

arXiv:2407.10814 (https://arxiv.org/abs/2407.10814) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Pathology-knowledge Enhanced Multi-instance Prompt Learning for Few-shot Whole Slide Image Classification
Authors: Linhao Qu, Dingkang Yang, Dan Huang, Qinhao Guo, Rongkui Luo, Shaoting Zhang, Xiaosong Wang
Abstract: Current multi-instance learning algorithms for pathology image analysis often require a substantial number of Whole Slide Images (WSIs) for effective training but exhibit suboptimal performance in scenarios with limited learning data. In clinical settings, restricted access to pathology slides is inevitable due to patient privacy concerns and the prevalence of rare or emerging diseases. The emerging task of few-shot weakly supervised WSI classification addresses the significant challenge of limited slide data and sparse slide-level labels for diagnosis. Prompt learning based on pre-trained models (e.g., CLIP) appears to be a promising scheme for this setting; however, current research in this area is limited, and existing algorithms often focus solely on patch-level prompts or confine themselves to language prompts. This paper proposes a multi-instance prompt learning framework enhanced with pathology knowledge, i.e., integrating visual and textual prior knowledge into prompts at both patch and slide levels. The training process employs a combination of static and learnable prompts, effectively guiding the activation of pre-trained models and further facilitating the diagnosis of key pathology patterns. Lightweight Messenger (self-attention) and Summary (attention-pooling) layers are introduced to model relationships between patches and slides within the same patient data. Additionally, alignment-wise contrastive losses ensure feature-level alignment between visual and textual learnable prompts for both patches and slides. Our method demonstrates superior performance in three challenging clinical tasks, significantly outperforming comparative few-shot methods.
Submitted 15 July, 2024; originally announced July 2024.
Comments: Accepted by ECCV 2024

arXiv:2407.04955 (https://arxiv.org/abs/2407.04955) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Asynchronous Multimodal Video Sequence Fusion via Learning Modality-Exclusive and -Agnostic Representations
Authors: Dingkang Yang, Mingcheng Li, Linhao Qu, Kun Yang, Peng Zhai, Song Wang, Lihua Zhang
Abstract: Understanding human intentions (e.g., emotions) from videos has received considerable attention recently. Video streams generally constitute a blend of temporal data stemming from distinct modalities, including natural language, facial expressions, and auditory clues. Despite the impressive advancements of previous works via attention-based paradigms, the inherent challenges of temporal asynchrony and modality heterogeneity remain in multimodal sequence fusion, causing adverse performance bottlenecks. To tackle these issues, we propose a Multimodal fusion approach for learning modality-Exclusive and modality-Agnostic representations (MEA) to refine multimodal features and leverage the complementarity across distinct modalities. On the one hand, MEA introduces a predictive self-attention module to capture reliable context dynamics within modalities and reinforce unique features over the modality-exclusive spaces. On the other hand, a hierarchical cross-modal attention module is designed to explore valuable element correlations among modalities over the modality-agnostic space. Meanwhile, a double-discriminator strategy is presented to ensure the production of distinct representations in an adversarial manner. Eventually, we propose a decoupled graph fusion mechanism to enhance knowledge exchange across heterogeneous modalities and learn robust multimodal representations for downstream tasks. Extensive experiments are conducted on three multimodal datasets with asynchronous sequences. Systematic analyses show the necessity of our approach.
Submitted 29 September, 2024; v1 submitted 6 July, 2024; originally announced July 2024.
Comments: Accepted by TCSVT 2024

arXiv:2407.02031 (https://arxiv.org/abs/2407.02031) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: SwiftDiffusion: Efficient Diffusion Model Serving with Add-on Modules
Authors: Suyi Li, Lingyun Yang, Xiaoxiao Jiang, Hanfeng Lu, Zhipeng Di, Weiyi Lu, Jiawei Chen, Kan Liu, Yinghao Yu, Tao Lan, Guodong Yang, Lin Qu, Liping Zhang, Wei Wang
Abstract: This paper documents our characterization study and practices for serving text-to-image requests with stable diffusion models in production. We first comprehensively analyze inference request traces for commercial text-to-image applications, beginning with the observation that add-on modules, i.e., ControlNets and LoRAs, which augment the base stable diffusion models, are ubiquitous in generating images for commercial applications. Despite their efficacy, these add-on modules incur high loading overhead, prolong serving latency, and consume expensive GPU resources. Driven by our characterization study, we present SwiftDiffusion, a system that efficiently generates high-quality images using stable diffusion models and add-on modules. To achieve this, SwiftDiffusion reconstructs the existing text-to-image serving workflow by identifying opportunities for parallel computation and distributing ControlNet computations across multiple GPUs. Further, SwiftDiffusion thoroughly analyzes the dynamics of image generation and develops techniques to eliminate the overhead associated with LoRA loading and patching while preserving image quality. Last, SwiftDiffusion proposes specialized optimizations in the backbone architecture of the stable diffusion models, which are also compatible with the efficient serving of add-on modules. Compared to state-of-the-art text-to-image serving systems, SwiftDiffusion reduces serving latency by up to 5x and improves serving throughput by up to 2x without compromising image quality.
Submitted 2 July, 2024; originally announced July 2024.

arXiv:2406.18156 (https://arxiv.org/abs/2406.18156) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.DC (Distributed, Parallel, and Cluster Computing); cs.NI (Networking and Internet Architecture); eess.SP (Signal Processing)
Title: FedAQ: Communication-Efficient Federated Edge Learning via Joint Uplink and Downlink Adaptive Quantization
Authors: Linping Qu, Shenghui Song, Chi-Ying Tsui
Abstract: Federated learning (FL) is a powerful machine learning paradigm which leverages the data as well as the computational resources of clients, while protecting clients' data privacy. However, the substantial model size and frequent aggregation between the server and clients result in significant communication overhead, making it challenging to deploy FL in resource-limited wireless networks. In this work, we aim to mitigate the communication overhead by using quantization. Previous research on quantization has primarily focused on the uplink communication, employing either fixed-bit quantization or adaptive quantization methods. We introduce a holistic approach, jointly applying uplink and downlink adaptive quantization to reduce the communication overhead. In particular, we optimize the learning convergence by determining the optimal uplink and downlink quantization bit-lengths, subject to a communication energy constraint. Theoretical analysis shows that the optimal quantization levels depend on the range of the model gradients or weights. Based on this insight, we propose a decreasing-trend quantization for the uplink and an increasing-trend quantization for the downlink, which aligns with the change of the model parameters during the training process. Experimental results show that the proposed joint uplink and downlink adaptive quantization strategy can save up to 66.7% energy compared with the existing schemes.
Submitted 26 June, 2024; originally announced June 2024.
Comments: This work has been submitted to the IEEE for possible publication
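
The two trend schedules can be illustrated with a stochastic uniform quantizer whose error tracks the tensor's range. The linear bit schedules and the quantizer below are illustrative assumptions, not the paper's optimized bit-lengths:

```python
import numpy as np

def quantize(x, bits, rng):
    """Stochastic uniform quantization over the tensor's own range; the
    paper's analysis ties the optimal bit-length to this range."""
    lo, hi = x.min(), x.max()
    levels = 2 ** bits - 1
    scaled = (x - lo) / (hi - lo + 1e-12) * levels
    q = np.floor(scaled) + (rng.random(x.shape) < scaled - np.floor(scaled))
    return lo + q / levels * (hi - lo)

rng = np.random.default_rng(0)
rounds = 10
# Assumed schedules: gradients shrink as training proceeds, so uplink bits
# can decrease; weight ranges settle, so downlink bits increase.
uplink_bits = np.linspace(8, 4, rounds).round().astype(int)
downlink_bits = np.linspace(4, 8, rounds).round().astype(int)

grad, weights = rng.normal(size=1000), rng.normal(size=1000)
for t in range(rounds):
    g_err = np.abs(quantize(grad, uplink_bits[t], rng) - grad).mean()
    w_err = np.abs(quantize(weights, downlink_bits[t], rng) - weights).mean()
    print(f"round {t}: up {uplink_bits[t]}b err {g_err:.4f} | "
          f"down {downlink_bits[t]}b err {w_err:.4f}")
```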

arXiv:2406.17430 (https://arxiv.org/abs/2406.17430) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Towards Probing Speech-Specific Risks in Large Multimodal Models: A Taxonomy, Benchmark, and Insights
Authors: Hao Yang, Lizhen Qu, Ehsan Shareghi, Gholamreza Haffari
Abstract: Large Multimodal Models (LMMs) have achieved great success recently, demonstrating a strong capability to understand multimodal information and to interact with human users. Despite the progress made, the challenge of detecting high-risk interactions in multimodal settings, and in particular in the speech modality, remains largely unexplored. Conventional research on risk for the speech modality primarily emphasises the content (e.g., what is captured in transcription). However, in speech-based interactions, paralinguistic cues in audio can significantly alter the intended meaning behind utterances. In this work, we propose a speech-specific risk taxonomy, covering 8 risk categories under hostility (malicious sarcasm and threats), malicious imitation (age, gender, ethnicity), and stereotypical biases (age, gender, ethnicity). Based on the taxonomy, we create a small-scale dataset for evaluating the capability of current LMMs to detect these categories of risk. We observe that even the latest models remain ineffective at detecting various paralinguistic-specific risks in speech (e.g., Gemini 1.5 Pro performs only slightly above the random baseline). Warning: this paper contains biased and offensive examples.
Submitted 25 June, 2024; originally announced June 2024.

arXiv:2406.17300 (https://arxiv.org/abs/2406.17300) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: CausalScore: An Automatic Reference-Free Metric for Assessing Response Relevance in Open-Domain Dialogue Systems
Authors: Tao Feng, Lizhen Qu, Xiaoxi Kang, Gholamreza Haffari
Abstract: Automatically evaluating the quality of responses in open-domain dialogue systems is a challenging but crucial task. Current evaluation metrics often fail to align with human judgments, especially when assessing responses that are grammatically correct. To address this issue, we propose a novel metric, called CausalScore, which assesses the relevance of responses by measuring the causal strength between dialogue histories and responses. The causal strength is estimated by utilizing both unconditional dependence and conditional dependencies from the dialogue history to responses. We compare our metric with the existing competitive metrics in terms of their alignment with human judgements. Our experimental results demonstrate that CausalScore significantly surpasses existing state-of-the-art metrics by aligning better with human judgements. Additionally, we collect a new dialogue dataset CGDIALOG+ with human-annotated causal relations and a set of pairwise human judgements to facilitate the development of future automatic metrics.
Submitted 25 June, 2024; originally announced June 2024.

arXiv:2406.15490 (https://arxiv.org/abs/2406.15490) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Causal Discovery Inspired Unsupervised Domain Adaptation for Emotion-Cause Pair Extraction
Authors: Yuncheng Hua, Yujin Huang, Shuo Huang, Tao Feng, Lizhen Qu, Chris Bain, Richard Bassed, Gholamreza Haffari
Abstract: This paper tackles the task of emotion-cause pair extraction in the unsupervised domain adaptation setting. The problem is challenging because the distributions of the events causing emotions in target domains differ dramatically from those in source domains, even though the distributions of emotional expressions overlap across domains. Inspired by causal discovery, we propose a novel deep latent model in the variational autoencoder (VAE) framework, which not only captures the underlying latent structures of the data but also uses the easily transferable knowledge of emotions as a bridge to link the distributions of events in different domains. To facilitate knowledge transfer across domains, we also propose a novel variational posterior regularization technique to disentangle the latent representations of emotions from those of events, in order to mitigate the damage caused by spurious correlations related to the events in source domains. Through extensive experiments, we demonstrate that our model outperforms the strongest baseline by approximately 11.05% on a Chinese benchmark and 2.45% on an English benchmark in terms of weighted-average F1 score. The source code will be publicly available upon acceptance.
Submitted 18 June, 2024; originally announced June 2024.
Comments: 12 pages, 6 figures, 4 tables; under review at EMNLP 2024
ACM Class: I.2.4

arXiv:2406.13217 (https://arxiv.org/abs/2406.13217) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Bridging Law and Data: Augmenting Reasoning via a Semi-Structured Dataset with IRAC methodology
Authors: Xiaoxi Kang, Lizhen Qu, Lay-Ki Soon, Zhuang Li, Adnan Trakic
Abstract: The effectiveness of Large Language Models (LLMs) in legal reasoning is often limited due to the unique legal terminologies and the necessity for highly specialized knowledge. These limitations highlight the need for high-quality data tailored for complex legal reasoning tasks. This paper introduces LEGALSEMI, a benchmark specifically curated for legal scenario analysis. LEGALSEMI comprises 54 legal scenarios, each rigorously annotated by legal experts, based on the comprehensive IRAC (Issue, Rule, Application, Conclusion) framework. In addition, LEGALSEMI is accompanied by a structured knowledge graph (SKG). A series of experiments were conducted to assess the usefulness of LEGALSEMI for IRAC analysis. The experimental results demonstrate the effectiveness of incorporating the SKG for issue identification, rule retrieval, application and conclusion generation using four different LLMs. LEGALSEMI will be publicly available upon acceptance of this paper.
Submitted 19 June, 2024; originally announced June 2024.
Submitted 19 June, 2024; originally announced June 2024.

arXiv:2406.10882 (https://arxiv.org/abs/2406.10882) [pdf, other] cs.CL
SCAR: Efficient Instruction-Tuning for Large Language Models via Style Consistency-Aware Response Ranking
Authors: Zhuang Li, Yuncheng Hua, Thuy-Trang Vu, Haolan Zhan, Lizhen Qu, Gholamreza Haffari
Abstract: Recent studies have shown that maintaining a consistent response style by human experts and enhancing data quality in training sets can significantly improve the performance of fine-tuned Large Language Models (LLMs) while reducing the number of training examples needed. However, the precise definition of style and the relationship between style, data quality, and LLM performance remain unclear. This research identifies two key stylistic elements in responses: linguistic form and semantic surprisal. We find that, among training data of comparable quality, higher consistency in these response elements leads to better LLM performance. Inspired by this, we introduce Style Consistency-Aware Response Ranking (SCAR), which automatically prioritizes instruction-response pairs in the training set based on their response stylistic consistency.
By selecting the most style-consistent examples, sometimes as few as 0.7% of the full dataset, the fine-tuned LLMs can match or even surpass the performance of models trained on the entire dataset on coding and open-ended question-answering benchmarks. Code and data are available at https://github.com/zhuang-li/SCAR .
Submitted 12 October, 2024; v1 submitted 16 June, 2024; originally announced June 2024.
Comments: 27 pages.
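The selection step SCAR describes can be pictured as scoring every instruction-response pair for stylistic consistency and keeping only the top slice. In this minimal sketch, `style_score` is a hypothetical placeholder for the learned ranker, which the abstract does not detail.

```python
# Hypothetical sketch: rank instruction-response pairs by a style-consistency
# score and keep only the top fraction for fine-tuning (0.7% echoes the
# smallest subset mentioned in the abstract).
def select_consistent_subset(pairs, style_score, keep_frac=0.007):
    ranked = sorted(
        pairs,
        key=lambda p: style_score(p["instruction"], p["response"]),
        reverse=True,
    )
    k = max(1, int(len(ranked) * keep_frac))
    return ranked[:k]
```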
arXiv:2406.08811 (https://arxiv.org/abs/2406.08811) [pdf, other] cs.CL
Mixture-of-Skills: Learning to Optimize Data Usage for Fine-Tuning Large Language Models
Authors: Minghao Wu, Thuy-Trang Vu, Lizhen Qu, Gholamreza Haffari
Abstract: Large language models (LLMs) are typically fine-tuned on diverse and extensive datasets sourced from various origins to develop a comprehensive range of skills, such as writing, reasoning, chatting, and coding. Each skill has unique characteristics, and these datasets are often heterogeneous and imbalanced, making the fine-tuning process highly challenging. Balancing the development of each skill while ensuring the model maintains its overall performance requires sophisticated techniques and careful dataset curation. In this work, we propose a general, model-agnostic reinforcement learning framework, Mixture-of-Skills (MoS), that learns to optimize data usage automatically during fine-tuning. The framework ensures comprehensive skill development by dynamically adjusting the focus on different datasets based on their current learning state. To validate the effectiveness of MoS, we conduct extensive experiments using three diverse LLM backbones on two widely used benchmarks and demonstrate that MoS substantially enhances model performance. Building on the success of MoS, we propose MoSpec, an adaptation for task-specific fine-tuning, which harnesses the utilities of various datasets for a specific purpose. Our work underlines the significance of dataset rebalancing and presents MoS as a powerful, general solution for optimizing data usage when fine-tuning LLMs for various purposes.
Submitted 6 October, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: 15 pages, 7 tables, 4 figures; accepted to EMNLP 2024 (main).
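One way to picture "learning to optimize data usage" is a bandit over datasets whose sampling weights rise when a dataset's batches yield learning progress. The reward signal and update rule below are assumptions for illustration, not the MoS algorithm itself.

```python
import numpy as np

# Illustrative sketch only: treat each fine-tuning dataset as an arm and adapt
# sampling weights from a reward (e.g., recent loss improvement on that arm).
class DatasetSampler:
    def __init__(self, n_datasets, lr=0.1):
        self.logits = np.zeros(n_datasets)
        self.lr = lr

    def probs(self):
        e = np.exp(self.logits - self.logits.max())
        return e / e.sum()

    def sample(self, rng=np.random):
        return rng.choice(len(self.logits), p=self.probs())

    def update(self, arm, reward):
        # Policy-gradient-style update: raise the weight of datasets whose
        # batches currently produce the most progress.
        p = self.probs()
        self.logits[arm] += self.lr * reward * (1 - p[arm])
```

During training one would call `sample()` to pick the next batch's source and `update()` with the observed reward after the optimizer step.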
arXiv:2406.05814 (https://arxiv.org/abs/2406.05814) [pdf, other] cs.CV cs.AI cs.CL cs.LG cs.MM
Unified Text-to-Image Generation and Retrieval
Authors: Leigang Qu, Haochuan Li, Tan Wang, Wenjie Wang, Yongqi Li, Liqiang Nie, Tat-Seng Chua
Abstract: How humans can efficiently and effectively acquire images has always been a perennial question. A typical solution is text-to-image retrieval from an existing database given a text query; however, a limited database typically lacks creativity. By contrast, recent breakthroughs in text-to-image generation make it possible to produce fancy and diverse visual content, but generation struggles to synthesize knowledge-intensive images. In this work, we rethink the relationship between text-to-image generation and retrieval and propose a unified framework in the context of Multimodal Large Language Models (MLLMs). Specifically, we first explore the intrinsic discriminative abilities of MLLMs and introduce a generative retrieval method that performs retrieval in a training-free manner. Subsequently, we unify generation and retrieval in an autoregressive manner and propose an autonomous decision module that chooses the best match between generated and retrieved images as the response to the text query. Additionally, we construct a benchmark called TIGeR-Bench, covering creative and knowledge-intensive domains, to standardize the evaluation of unified text-to-image generation and retrieval. Extensive experimental results on TIGeR-Bench and two retrieval benchmarks, i.e., Flickr30K and MS-COCO, demonstrate the superiority and effectiveness of our proposed method.
Submitted 9 June, 2024; originally announced June 2024.
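At a very high level, the decision module amounts to scoring one generated candidate against several retrieved ones and returning the best match. Everything in this sketch (`similarity`, `generate`, `retrieve`) is a hypothetical stand-in; in the paper the scoring happens inside the MLLM, not via an external scorer.

```python
import random

def similarity(query, image):            # stand-in for a CLIP-style text-image scorer
    return random.random()

def generate(query):                     # stand-in for the generation branch
    return f"<generated image for '{query}'>"

def retrieve(query, top_k=3):            # stand-in for the retrieval branch
    return [f"<db image {i} for '{query}'>" for i in range(top_k)]

def answer_query(query):
    # Pool the generated candidate with retrieved ones; return the best match.
    candidates = [generate(query)] + retrieve(query)
    return max(candidates, key=lambda img: similarity(query, img))

print(answer_query("a red panda playing chess"))
```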
arXiv:2406.05615 (https://arxiv.org/abs/2406.05615) [pdf, other] cs.CL
Video-Language Understanding: A Survey from Model Architecture, Model Training, and Data Perspectives
Authors: Thong Nguyen, Yi Bin, Junbin Xiao, Leigang Qu, Yicong Li, Jay Zhangjie Wu, Cong-Duy Nguyen, See-Kiong Ng, Luu Anh Tuan
Abstract: Humans use multiple senses to comprehend their environment. Vision and language are two of the most vital senses, since they allow us to easily communicate our thoughts and perceive the world around us. There has been much interest in creating video-language understanding systems with human-like senses, since a video-language pair can mimic both our linguistic medium and our visual environment with temporal dynamics. In this survey, we review the key tasks of these systems and highlight the associated challenges. Based on the challenges, we summarize their methods from model architecture, model training, and data perspectives. We also conduct a performance comparison among the methods and discuss promising directions for future research.
Submitted 1 July, 2024; v1 submitted 8 June, 2024; originally announced June 2024.
Comments: accepted at ACL 2024 (Findings).

arXiv:2406.05387 (https://arxiv.org/abs/2406.05387) [pdf, other] cs.IR
PTF-FSR: A Parameter Transmission-Free Federated Sequential Recommender System
Authors: Wei Yuan, Chaoqun Yang, Liang Qu, Quoc Viet Hung Nguyen, Guanhua Ye, Hongzhi Yin
Abstract: Sequential recommender systems have made significant progress. Recently, owing to increasing concerns about user data privacy, some researchers have applied federated learning to sequential recommendation, yielding Federated Sequential Recommender Systems (FedSeqRecs), in which a public sequential recommender model is shared and frequently transmitted between a central server and clients to achieve collaborative learning. Although these solutions mitigate user privacy risks to some extent, they present two significant limitations that affect their practical usability: (1) they require a globally shared sequential recommendation model; in real-world scenarios, however, that model constitutes critical intellectual property for platform and service providers,
who may therefore be reluctant to disclose their meticulously developed models; and (2) the communication costs are high, as they scale with the number of model parameters, which becomes particularly problematic as sequential recommendation marches into the large-language-model era, where current FedSeqRecs become inapplicable. To overcome these challenges, this paper proposes a parameter transmission-free federated sequential recommendation framework (PTF-FSR), which protects both model and data privacy to meet the needs of service providers and system users alike. Furthermore, since PTF-FSR transmits only prediction results under privacy protection, which are independent of model size, this new federated learning architecture can accommodate more complex and larger sequential recommendation models. Extensive experiments on three widely used recommendation datasets, employing various sequential recommendation models from both ID-based and ID-free paradigms, demonstrate the effectiveness and generalization capability of the proposed framework.
Submitted 8 June, 2024; originally announced June 2024.
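The core idea, sharing predictions instead of parameters, can be sketched in a few lines: each client runs local inference over a shared item set, perturbs the scores for privacy, and only those scores are aggregated. Every name and the perturbation scheme here are assumptions; the abstract does not define the exact protocol.

```python
import numpy as np

# Conceptual sketch (not the paper's exact protocol): clients keep their models
# private and upload only noised prediction scores over a shared item set.
rng = np.random.default_rng(0)
n_items, n_clients = 100, 5
client_scores = [rng.random(n_items) for _ in range(n_clients)]  # stand-in for local model outputs

def privatize(scores, noise=0.05):
    # Simple additive perturbation as a placeholder privacy mechanism.
    return scores + rng.normal(0, noise, scores.shape)

consensus = np.mean([privatize(s) for s in client_scores], axis=0)
# A server-side model would now be trained to match `consensus` (distillation
# from outputs), so no parameters ever leave a client, and the transmitted
# payload size is independent of model size.
```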
arXiv:2406.03749 (https://arxiv.org/abs/2406.03749) [pdf, other] cs.CL
NAP^2: A Benchmark for Naturalness and Privacy-Preserving Text Rewriting by Learning from Human
Authors: Shuo Huang, William MacLean, Xiaoxi Kang, Anqi Wu, Lizhen Qu, Qiongkai Xu, Zhuang Li, Xingliang Yuan, Gholamreza Haffari
Abstract: Increasing concerns about privacy leakage arise in academia and industry when NLP models from third-party providers are used to process sensitive texts. To protect privacy before sending sensitive data to such models, we suggest sanitizing sensitive text using two strategies commonly used by humans: i) deleting sensitive expressions, and ii) obscuring sensitive details by abstracting them. To explore these issues and develop a tool for text rewriting, we curate the first such corpus, coined NAP^2, through both crowdsourcing and the use of large language models (LLMs). Compared to prior work based on differential privacy, which leads to a sharp drop in information utility and unnatural texts, the human-inspired approaches produce more natural rewrites and offer an improved balance between privacy protection and data utility, as demonstrated by our extensive experiments.
Submitted 6 June, 2024; originally announced June 2024.
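The two human strategies are easy to see on a toy sentence. The spans and abstractions below are hand-picked for illustration; the benchmark's rewrites come from crowdworkers and LLMs, not from rules like these.

```python
import re

text = "Contact Jane Doe at 42 Baker Street about the overdue invoice."

# Strategy i: deletion removes the sensitive spans outright.
deleted = re.sub(r"Jane Doe|42 Baker Street", "", text)

# Strategy ii: abstraction replaces them with less specific descriptions.
abstracted = (text.replace("Jane Doe", "a client")
                  .replace("42 Baker Street", "her home address"))

print(deleted)     # "Contact  at  about the overdue invoice."
print(abstracted)  # "Contact a client at her home address about the overdue invoice."
```

Abstraction keeps the sentence natural and useful while deletion sacrifices fluency, which is the trade-off the benchmark is built to measure.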
arXiv:2406.01375 (https://arxiv.org/abs/2406.01375) [pdf, other] cs.CL
D-CPT Law: Domain-specific Continual Pre-Training Scaling Law for Large Language Models
Authors: Haoran Que, Jiaheng Liu, Ge Zhang, Chenchen Zhang, Xingwei Qu, Yinghao Ma, Feiyu Duan, Zhiqi Bai, Jiakai Wang, Yuanxing Zhang, Xu Tan, Jie Fu, Wenbo Su, Jiamang Wang, Lin Qu, Bo Zheng
Abstract: Continual Pre-Training (CPT) of Large Language Models (LLMs) is widely used to expand a model's fundamental understanding of specific downstream domains (e.g., math and code). A key question for domain-specific CPT is how to choose the optimal mixture ratio between the general corpus (e.g., Dolma, SlimPajama) and the downstream domain corpus. Existing methods usually resort to laborious grid searches over a set of mixture ratios, which incur high GPU training costs and still cannot guarantee that the selected ratio is optimal for the specific domain.
To address these limitations, and inspired by the Scaling Law for performance prediction, we investigate the Scaling Law of Domain-specific Continual Pre-Training (D-CPT Law) to determine the optimal mixture ratio with acceptable training costs for LLMs of different sizes. Specifically, by fitting the D-CPT Law, we can easily predict general and downstream performance for arbitrary mixture ratios, model sizes, and dataset sizes using small-scale training runs on limited experiments. Moreover, we extend the standard D-CPT Law to cross-domain settings and propose the Cross-Domain D-CPT Law to predict the D-CPT Law of target domains, requiring very small training costs (about 1% of normal training costs) for the target domains. Comprehensive experimental results on six downstream domains demonstrate the effectiveness and generalizability of the proposed D-CPT Law and Cross-Domain D-CPT Law.
Submitted 3 June, 2024; originally announced June 2024.
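The workflow of "fit a law on cheap pilot runs, then read off the best ratio" can be sketched with an ordinary parametric curve fit. The U-shaped functional form and the pilot losses below are placeholders, not the paper's actual D-CPT Law parameterization or data.

```python
import numpy as np
from scipy.optimize import curve_fit

# Hedged sketch: fit a generic U-shaped loss curve over mixture ratio r
# (domain-corpus fraction), then pick the ratio minimizing predicted loss.
def loss_model(r, a, alpha, b, beta, c):
    eps = 1e-3
    # Domain term falls as r grows; general-ability term rises: a U shape.
    return a * (r + eps) ** (-alpha) + b * (1 - r + eps) ** (-beta) + c

r_obs = np.array([0.05, 0.2, 0.4, 0.6, 0.8, 0.95])   # pilot mixture ratios
L_obs = np.array([3.1, 2.6, 2.4, 2.45, 2.7, 3.3])    # made-up pilot losses

params, _ = curve_fit(loss_model, r_obs, L_obs,
                      p0=[0.1, 1.0, 0.1, 1.0, 2.0], maxfev=20_000)
grid = np.linspace(0.01, 0.99, 99)
best_r = grid[np.argmin(loss_model(grid, *params))]
print(f"predicted optimal mixture ratio ~ {best_r:.2f}")
```

The point of such a fit is that a handful of small runs replaces a full grid search over ratios at target scale.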
arXiv:2405.10084 (https://arxiv.org/abs/2405.10084) [pdf, other] eess.AS cs.AI cs.SD
Revisiting Deep Audio-Text Retrieval Through the Lens of Transportation
Authors: Manh Luong, Khai Nguyen, Nhat Ho, Reza Haf, Dinh Phung, Lizhen Qu
Abstract: The Learning-to-match (LTM) framework proves to be an effective inverse optimal transport approach for learning the underlying ground metric between two sources of data, facilitating subsequent matching. However, the conventional LTM framework faces scalability challenges, necessitating use of the entire dataset each time the parameters of the ground metric are updated. In adapting LTM to the deep learning context, we introduce the mini-batch Learning-to-match (m-LTM) framework for audio-text retrieval, which leverages mini-batch subsampling and a Mahalanobis-enhanced family of ground metrics. Moreover, to cope with misaligned training data in practice, we propose a variant using partial optimal transport to mitigate the harm of misaligned data pairs in training. We conduct extensive experiments on audio-text matching problems using three datasets: AudioCaps, Clotho, and ESC-50. Results demonstrate that the proposed method learns a rich and expressive joint embedding space and achieves SOTA performance. Beyond this, the m-LTM framework is able to close the modality gap between audio and text embeddings, surpassing both triplet and contrastive loss on the zero-shot sound event detection task on the ESC-50 dataset. Notably, our strategy of employing partial optimal transport with m-LTM demonstrates greater noise tolerance than contrastive loss, especially under varying noise ratios in the AudioCaps training data. Our code is available at https://github.com/v-manhlt3/m-LTM-Audio-Text-Retrieval
Submitted 16 May, 2024; originally announced May 2024.
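Two of the ingredients named above, mini-batch optimal transport and a Mahalanobis ground metric, are standard enough to sketch directly. This is a plain NumPy Sinkhorn solver on one batch; the real method trains the matrix M and the encoders jointly, which is omitted here.

```python
import numpy as np

# Entropic OT on a mini-batch with Mahalanobis ground cost
# d_M(a, t) = (a - t)^T M (a - t), where M is positive semi-definite.
def sinkhorn(C, reg=0.1, iters=200):
    C = C / C.mean()                             # normalize costs for stability
    K = np.exp(-C / reg)
    a = np.full(C.shape[0], 1 / C.shape[0])      # uniform batch marginals
    b = np.full(C.shape[1], 1 / C.shape[1])
    u, v = np.ones_like(a), np.ones_like(b)
    for _ in range(iters):
        u = a / (K @ v)
        v = b / (K.T @ u)
    return u[:, None] * K * v[None, :]           # transport plan

rng = np.random.default_rng(0)
audio = rng.normal(size=(8, 16))                 # batch of audio embeddings
text = rng.normal(size=(8, 16))                  # batch of text embeddings
L = rng.normal(size=(16, 16)) * 0.1
M = L @ L.T + np.eye(16)                         # PSD Mahalanobis matrix
diff = audio[:, None, :] - text[None, :, :]
C = np.einsum('ijk,kl,ijl->ij', diff, M, diff)   # pairwise Mahalanobis costs
P = sinkhorn(C)
print(P.sum(), (P * C).sum())                    # total mass ~ 1, OT cost
```

The partial-OT variant for noisy pairs would relax the marginal constraints so that only part of the batch mass must be transported; that relaxation is not shown.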
arXiv:2405.06563 (https://arxiv.org/abs/2405.06563) [pdf, other] cs.CL
What Can Natural Language Processing Do for Peer Review?
Authors: Ilia Kuznetsov, Osama Mohammed Afzal, Koen Dercksen, Nils Dycke, Alexander Goldberg, Tom Hope, Dirk Hovy, Jonathan K. Kummerfeld, Anne Lauscher, Kevin Leyton-Brown, Sheng Lu, Mausam, Margot Mieskes, Aurélie Névéol, Danish Pruthi, Lizhen Qu, Roy Schwartz, Noah A. Smith, Thamar Solorio, Jingyan Wang, Xiaodan Zhu, Anna Rogers, Nihar B. Shah, Iryna Gurevych
Abstract: The number of scientific articles produced every year is growing rapidly. Providing quality control over them is crucial for scientists and, ultimately, for the public good. In modern science, this process is largely delegated to peer review, a distributed procedure in which each submission is evaluated by several independent experts in the field. Peer review is widely used, yet it is hard, time-consuming, and prone to error. Since the artifacts involved in peer review (manuscripts, reviews, discussions) are largely text-based, Natural Language Processing has great potential to improve reviewing.
As the emergence of large language models (LLMs) has enabled NLP assistance for many new tasks, the discussion of machine-assisted peer review is picking up pace. Yet where exactly is help needed, where can NLP help, and where should it stand aside? The goal of this paper is to provide a foundation for future efforts in NLP for peer-reviewing assistance. We discuss peer review as a general process, exemplified by reviewing at AI conferences. We detail each step of the process from manuscript submission to camera-ready revision and discuss the associated challenges and opportunities for NLP assistance, illustrated by existing work. We then turn to the big challenges in NLP for peer review as a whole, including data acquisition and licensing, operationalization and experimentation, and ethical issues. To help consolidate community efforts, we create a companion repository that aggregates key datasets pertaining to peer review. Finally, we issue a detailed call for action for the scientific community, NLP and AI researchers, policymakers, and funding bodies to help bring research in NLP for peer review forward. We hope that our work will help set the agenda for research in machine-assisted scientific quality control in the age of AI, within the NLP community and beyond.
Submitted 10 May, 2024; originally announced May 2024.

arXiv:2404.15889 (https://arxiv.org/abs/2404.15889) [pdf, other] cs.CV cs.GR
Sketch2Human: Deep Human Generation with Disentangled Geometry and Appearance Control
Authors: Linzi Qu, Jiaxiang Shang, Hui Ye, Xiaoguang Han, Hongbo Fu
Abstract: Geometry- and appearance-controlled full-body human image generation is an interesting but challenging task. Existing solutions are either unconditional or dependent on coarse conditions (e.g., pose, text), and thus lack explicit geometry and appearance control of body and garment.
Sketching offers such editing ability and has been adopted in various sketch-based face generation and editing solutions. However, directly adapting sketch-based face generation to full-body generation often fails to produce high-fidelity and diverse results, owing to the high complexity and diversity of pose, body shape, and garment shape and texture. Recent geometrically controllable diffusion-based methods mainly rely on prompts to generate appearance, and it is hard to balance the realism and the faithfulness of their results to the sketch when the input is coarse. This work presents Sketch2Human, the first system for controllable full-body human image generation guided by a semantic sketch (for geometry control) and a reference image (for appearance control). Our solution builds on the latent space of StyleGAN-Human, with inverted geometry and appearance latent codes as input. Specifically, we present a sketch encoder trained with a large synthetic dataset sampled from StyleGAN-Human's latent space and directly supervised by sketches rather than real images. Considering the entangled information of partial geometry and texture in StyleGAN-Human and the absence of disentangled datasets, we design a novel training scheme that creates geometry-preserved and appearance-transferred training data to tune a generator for disentangled geometry and appearance control. Although trained with synthetic data, our method handles hand-drawn sketches as well. Qualitative and quantitative evaluations demonstrate the superior performance of our method over state-of-the-art methods.
Submitted 24 April, 2024; originally announced April 2024.
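Combining "inverted geometry and appearance latent codes" is reminiscent of StyleGAN layer mixing, which the snippet below illustrates. The layer split index and the idea that geometry lives in early layers are assumptions borrowed from common StyleGAN practice, not details taken from the paper.

```python
import numpy as np

# Illustrative W+ "layer mixing": early layers from the sketch-inverted code
# (geometry), later layers from the reference-image code (appearance).
n_layers, dim_w = 18, 512
w_geometry = np.random.randn(n_layers, dim_w)    # stand-in: from the sketch encoder
w_appearance = np.random.randn(n_layers, dim_w)  # stand-in: inverted reference image

split = 8                                        # hypothetical geometry/appearance boundary
w_mixed = np.vstack([w_geometry[:split], w_appearance[split:]])
# `w_mixed` would then be fed to a StyleGAN-Human-style generator.
```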
arXiv:2404.15585 (https://arxiv.org/abs/2404.15585) [pdf, other] cs.LG eess.IV
Brain Storm Optimization Based Swarm Learning for Diabetic Retinopathy Image Classification
Authors: Liang Qu, Cunze Wang, Yuhui Shi
Abstract: The application of deep learning techniques to medical problems has garnered widespread research interest in recent years, for example in applying convolutional neural networks to medical image classification. However, data in the medical field is often highly private, preventing different hospitals from sharing data to train an accurate model. Federated learning, a privacy-preserving machine learning architecture, has shown promising performance in balancing data privacy and model utility: private data stays on the client side while a central server coordinates a set of clients for model training by aggregating their uploaded model parameters. Yet this architecture heavily relies on a trusted third-party server, which is hard to achieve in practice. Swarm learning, a specialized decentralized federated learning architecture that does not require a central server, uses blockchain technology to enable direct parameter exchange between clients. However, mining blocks requires significant computational resources, limiting its scalability. To address this issue, this paper integrates the brain storm optimization algorithm into the swarm learning framework, named BSO-SL. The approach clusters similar clients into groups based on their model distributions. Additionally, leveraging the architecture of BSO, clients are given a probability of engaging in collaborative learning both within and outside their cluster, preventing the model from converging to local optima.
The proposed method has been validated on a real-world diabetic retinopathy image classification dataset, and the experimental results demonstrate its effectiveness.
Submitted 23 April, 2024; originally announced April 2024.
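The BSO-style pairing step described above can be sketched as clustering clients by their (flattened) model parameters and then choosing each client's collaboration partner from inside its own cluster with some probability, otherwise from another cluster. The clustering method and probabilities here are illustrative assumptions, not the paper's settings.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
params = rng.normal(size=(20, 64))               # 20 clients' flattened parameters
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(params)

def pick_partner(i, p_in=0.8):
    # With probability p_in collaborate inside the cluster, else outside it,
    # so clients occasionally escape their local group (cf. avoiding local optima).
    same = np.flatnonzero((labels == labels[i]) & (np.arange(len(labels)) != i))
    other = np.flatnonzero(labels != labels[i])
    pool = same if (rng.random() < p_in and len(same) > 0) else other
    return int(rng.choice(pool))

print([pick_partner(i) for i in range(5)])
```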