Search | arXiv e-print repository

Showing 1–50 of 976 results for author: Jiang, Z
Searching in archive cs. Sorted by announcement date (newest first); 50 results per page. (arXiv Search v0.5.6, released 2020-02-24)
1. arXiv:2502.08347 [pdf, other] (cs.CV)
   Hi-End-MAE: Hierarchical encoder-driven masked autoencoders are stronger vision learners for medical image segmentation
   Authors: Fenghe Tang, Qingsong Yao, Wenxin Ma, Chenxu Wu, Zihang Jiang, S. Kevin Zhou
   Abstract: Medical image segmentation remains a formidable challenge due to label scarcity. Pre-training a Vision Transformer (ViT) through masked image modeling (MIM) on large-scale unlabeled medical datasets is a promising solution, providing both computational efficiency and model generalization for various downstream tasks. However, current ViT-based MIM pre-training frameworks predominantly emphasize local aggregation representations in output layers and fail to exploit the rich representations across different ViT layers that better capture the fine-grained semantic information needed for more precise medical downstream tasks. To fill this gap, we present Hierarchical Encoder-driven MAE (Hi-End-MAE), a simple yet effective ViT-based pre-training solution centered on two key innovations: (1) encoder-driven reconstruction, which encourages the encoder to learn more informative features that guide the reconstruction of masked patches; and (2) hierarchical dense decoding, which implements a hierarchical decoding structure to capture rich representations across different layers. We pre-train Hi-End-MAE on a large-scale dataset of 10K CT scans and evaluate its performance across seven public medical image segmentation benchmarks. Extensive experiments demonstrate that Hi-End-MAE achieves superior transfer learning capabilities across various downstream tasks, revealing the potential of ViT in medical imaging applications. The code is available at https://github.com/FengheTan9/Hi-End-MAE
   Submitted 12 February, 2025; originally announced February 2025.
   Comments: 19 pages. Code: https://github.com/FengheTan9/Hi-End-MAE
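   The hierarchical dense decoding idea above is easy to picture with a toy sketch: a reconstruction head reads features tapped from several encoder depths rather than only the final layer. This is a minimal illustration under invented names and sizes (TinyHiMAE, the tap depths, the 16x16 patch), not the paper's implementation.

    # Toy sketch: the head fuses features from several encoder depths
    # ("hierarchical dense decoding") before reconstructing patch pixels.
    import torch
    import torch.nn as nn

    class TinyHiMAE(nn.Module):
        def __init__(self, dim=192, depth=6, tap_layers=(1, 3, 5), patch_px=16 * 16):
            super().__init__()
            self.blocks = nn.ModuleList(
                [nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
                 for _ in range(depth)])
            self.tap_layers = set(tap_layers)               # depths that feed the decoder
            self.fuse = nn.Linear(dim * len(tap_layers), dim)
            self.head = nn.Linear(dim, patch_px)            # predict raw patch pixels

        def forward(self, visible_tokens):
            taps, x = [], visible_tokens
            for i, blk in enumerate(self.blocks):
                x = blk(x)
                if i in self.tap_layers:
                    taps.append(x)                          # keep intermediate features
            fused = self.fuse(torch.cat(taps, dim=-1))      # multi-layer fusion
            return self.head(fused)

    out = TinyHiMAE()(torch.randn(2, 49, 192))   # 2 images, 49 visible tokens each
    print(out.shape)                             # torch.Size([2, 49, 256])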
2. arXiv:2502.08200 [pdf, other] (cs.CV)
   ActiveSSF: An Active-Learning-Guided Self-Supervised Framework for Long-Tailed Megakaryocyte Classification
   Authors: Linghao Zhuang, Ying Zhang, Gege Yuan, Xingyue Zhao, Zhiping Jiang
   Abstract: Precise classification of megakaryocytes is crucial for diagnosing myelodysplastic syndromes. Although self-supervised learning has shown promise in medical image analysis, its application to classifying megakaryocytes in stained slides faces three main challenges: (1) pervasive background noise that obscures cellular details, (2) a long-tailed distribution that limits data for rare subtypes, and (3) complex morphological variations leading to high intra-class variability. To address these issues, we propose the ActiveSSF framework, which integrates active learning with self-supervised pretraining. Specifically, our approach employs Gaussian filtering combined with K-means clustering and HSV analysis (augmented by clinical prior knowledge) for accurate region-of-interest extraction; an adaptive sample selection mechanism that dynamically adjusts similarity thresholds to mitigate class imbalance; and prototype clustering on labeled samples to overcome morphological complexity. Experimental results on clinical megakaryocyte datasets demonstrate that ActiveSSF not only achieves state-of-the-art performance but also significantly improves recognition accuracy for rare subtypes. Moreover, the integration of these advanced techniques further underscores the practical potential of ActiveSSF in clinical settings. To foster further research, the code and datasets will be publicly released in the future.
   Submitted 12 February, 2025; originally announced February 2025.
   Comments: 6 pages, submitted to EMBC 2025
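   The region-of-interest step described above (Gaussian filtering, HSV analysis, and K-means clustering) can be sketched in a few lines. The cluster count and the assumption that the most saturated cluster corresponds to stained cells are invented for illustration; this is not the authors' pipeline.

    # Rough sketch of Gaussian filtering + HSV analysis + K-means for ROI
    # extraction on a stained slide; thresholds/heuristics are hypothetical.
    import cv2
    import numpy as np
    from sklearn.cluster import KMeans

    def extract_roi(bgr_image, k=3):
        smoothed = cv2.GaussianBlur(bgr_image, (5, 5), 0)     # suppress background noise
        hsv = cv2.cvtColor(smoothed, cv2.COLOR_BGR2HSV)
        pixels = hsv.reshape(-1, 3).astype(np.float32)
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(pixels)
        # Assume the most saturated cluster is the stained-cell region.
        sat_means = [pixels[labels == i, 1].mean() for i in range(k)]
        cell_mask = (labels == int(np.argmax(sat_means))).reshape(hsv.shape[:2])
        return cell_mask.astype(np.uint8) * 255

    mask = extract_roi(np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8))
    print(mask.shape)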
3. arXiv:2502.05911 [pdf, other] (cs.CL)
   GRAIT: Gradient-Driven Refusal-Aware Instruction Tuning for Effective Hallucination Mitigation
   Authors: Runchuan Zhu, Zinco Jiang, Jiang Wu, Zhipeng Ma, Jiahe Song, Fengshuo Bai, Dahua Lin, Lijun Wu, Conghui He
   Abstract: Refusal-Aware Instruction Tuning (RAIT) aims to enhance Large Language Models (LLMs) by improving their ability to refuse responses to questions beyond their knowledge, thereby reducing hallucinations and improving reliability. Effective RAIT must address two key challenges: first, it must effectively reject unknown questions to minimize hallucinations; second, it must avoid over-refusal so that questions that can be correctly answered are not rejected, thereby maintaining the helpfulness of LLM outputs. In this paper, we address these two challenges by deriving insightful observations from a gradient-based perspective and proposing the Gradient-driven Refusal-Aware Instruction Tuning framework (GRAIT), which (1) employs gradient-driven sample selection to effectively minimize hallucinations and (2) introduces an adaptive weighting mechanism during fine-tuning to reduce the risk of over-refusal, achieving a balance between accurate refusals and maintaining useful responses. Experimental evaluations on open-ended and multiple-choice question answering tasks demonstrate that GRAIT significantly outperforms existing RAIT methods in overall performance. The source code and data will be available at https://github.com/opendatalab/GRAIT
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: Equal contribution: Runchuan Zhu, Zinco Jiang, Jiang Wu; Corresponding author: Conghui He
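   Gradient-driven sample selection, in its generic form, ranks candidate tuning samples by the gradient signal they induce and keeps the most informative ones. The sketch below shows that generic heuristic on a stand-in model; it is an assumed simplification, not GRAIT's actual selection rule.

    # Generic gradient-based selection: score each candidate sample by the
    # norm of the loss gradient it induces, then keep the top-scoring ones.
    import torch
    import torch.nn as nn

    model = nn.Linear(8, 2)                    # stand-in for an LLM
    loss_fn = nn.CrossEntropyLoss()

    def grad_norm(x, y):
        model.zero_grad()
        loss_fn(model(x), y).backward()
        return sum(p.grad.norm() ** 2 for p in model.parameters()).sqrt().item()

    data = [(torch.randn(1, 8), torch.tensor([i % 2])) for i in range(10)]
    scores = [grad_norm(x, y) for x, y in data]
    keep = sorted(range(len(data)), key=lambda i: -scores[i])[:5]
    print("selected sample indices:", keep)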
id="2502.05878v2-abstract-short" style="display: inline;"> Stock movement prediction, a critical task in financial time-series forecasting, relies on identifying and retrieving key influencing factors from vast and complex datasets. However, traditional text-trained or numeric similarity-based retrieval methods often struggle to handle the intricacies of financial data. To address this, we propose the first retrieval-augmented generation (RAG) framework s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05878v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05878v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05878v2-abstract-full" style="display: none;"> Stock movement prediction, a critical task in financial time-series forecasting, relies on identifying and retrieving key influencing factors from vast and complex datasets. However, traditional text-trained or numeric similarity-based retrieval methods often struggle to handle the intricacies of financial data. To address this, we propose the first retrieval-augmented generation (RAG) framework specifically designed for financial time-series forecasting. Our framework incorporates three key innovations: a fine-tuned 1B large language model (StockLLM) as its backbone, a novel candidate selection method enhanced by LLM feedback, and a training objective that maximizes the similarity between queries and historically significant sequences. These advancements enable our retriever, FinSeer, to uncover meaningful patterns while effectively minimizing noise in complex financial datasets. To support robust evaluation, we also construct new datasets that integrate financial indicators and historical stock prices. Experimental results demonstrate that our RAG framework outperforms both the baseline StockLLM and random retrieval methods, showcasing its effectiveness. FinSeer, as the retriever, achieves an 8% higher accuracy on the BIGDATA22 benchmark and retrieves more impactful sequences compared to existing retrieval methods. This work highlights the importance of tailored retrieval models in financial forecasting and provides a novel, scalable framework for future research in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05878v2-abstract-full').style.display = 'none'; document.getElementById('2502.05878v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05622">arXiv:2502.05622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05622">pdf</a>, <a href="https://arxiv.org/format/2502.05622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Social inequality and cultural factors impact the awareness and reaction during the cryptic transmission period of pandemic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhuoren Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaozhong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+Y">Yangyang Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+C">Changlong Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ahn%2C+Y">Yong-Yeol Ahn</a>, <a href="/search/cs?searchtype=author&amp;query=Bollen%2C+J">Johan Bollen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05622v1-abstract-short" style="display: inline;"> The World Health Organization (WHO) declared the COVID-19 outbreak a Public Health Emergency of International Concern (PHEIC) on January 31, 2020. However, rumors of a &#34;mysterious virus&#34; had already been circulating in China in December 2019, possibly preceding the first confirmed COVID-19 case. Understanding how awareness about an emerging pandemic spreads through society is vital not only for en&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05622v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05622v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05622v1-abstract-full" style="display: none;"> The World Health Organization (WHO) declared the COVID-19 outbreak a Public Health Emergency of International Concern (PHEIC) on January 31, 2020. However, rumors of a &#34;mysterious virus&#34; had already been circulating in China in December 2019, possibly preceding the first confirmed COVID-19 case. Understanding how awareness about an emerging pandemic spreads through society is vital not only for enhancing disease surveillance, but also for mitigating demand shocks and social inequities, such as shortages of personal protective equipment (PPE) and essential supplies. Here we leverage a massive e-commerce dataset comprising 150 billion online queries and purchase records from 94 million people to detect the traces of early awareness and public response during the cryptic transmission period of COVID-19. Our analysis focuses on identifying information gaps across different demographic cohorts, revealing significant social inequities and the role of cultural factors in shaping awareness diffusion and response behaviors. By modeling awareness diffusion in heterogeneous social networks and analyzing online shopping behavior, we uncover the evolving characteristics of vulnerable populations. 
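   The training objective mentioned above (maximizing similarity between queries and historically significant sequences) is naturally expressed as a contrastive loss over candidate embeddings. The sketch below is one standard way to write such an objective; the temperature, embedding sizes, and single-positive setup are assumptions, not details from the paper.

    # Contrastive retrieval objective: pull the query embedding toward the
    # "historically significant" positive sequence, away from the others.
    import torch
    import torch.nn.functional as F

    def retrieval_loss(query_emb, seq_embs, positive_idx, temperature=0.05):
        sims = F.cosine_similarity(query_emb.unsqueeze(0), seq_embs) / temperature
        return F.cross_entropy(sims.unsqueeze(0), torch.tensor([positive_idx]))

    q = torch.randn(64)                # embedding of the current query
    candidates = torch.randn(16, 64)   # embeddings of candidate sequences
    print(retrieval_loss(q, candidates, positive_idx=3))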
5. arXiv:2502.05622 [pdf, other] (cs.SI)
   Social inequality and cultural factors impact the awareness and reaction during the cryptic transmission period of pandemic
   Authors: Zhuoren Jiang, Xiaozhong Liu, Yangyang Kang, Changlong Sun, Yong-Yeol Ahn, Johan Bollen
   Abstract: The World Health Organization (WHO) declared the COVID-19 outbreak a Public Health Emergency of International Concern (PHEIC) on January 31, 2020. However, rumors of a "mysterious virus" had already been circulating in China in December 2019, possibly preceding the first confirmed COVID-19 case. Understanding how awareness about an emerging pandemic spreads through society is vital not only for enhancing disease surveillance, but also for mitigating demand shocks and social inequities, such as shortages of personal protective equipment (PPE) and essential supplies. Here we leverage a massive e-commerce dataset comprising 150 billion online queries and purchase records from 94 million people to detect the traces of early awareness and public response during the cryptic transmission period of COVID-19. Our analysis focuses on identifying information gaps across different demographic cohorts, revealing significant social inequities and the role of cultural factors in shaping awareness diffusion and response behaviors. By modeling awareness diffusion in heterogeneous social networks and analyzing online shopping behavior, we uncover the evolving characteristics of vulnerable populations. Our findings expand the theoretical understanding of awareness spread and social inequality in the early stages of a pandemic, highlighting the critical importance of e-commerce data and social network data in effectively and timely addressing future pandemic challenges. We also provide actionable recommendations to better manage and mitigate dynamic social inequalities in public health crises.
   Submitted 8 February, 2025; originally announced February 2025.
   Comments: Accepted by PNAS Nexus; will be available online as an open-access publication soon
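   To make "modeling awareness diffusion in heterogeneous social networks" concrete, here is a minimal susceptible-informed diffusion toy on a random directed graph. All parameters (edge probability, spread probability, seed node) are illustrative and unrelated to the paper's calibrated model.

    # Minimal susceptible-informed (SI) awareness diffusion on a random graph.
    import random

    random.seed(0)
    n, p_edge, p_spread = 200, 0.03, 0.2
    adj = [[j for j in range(n) if j != i and random.random() < p_edge]
           for i in range(n)]
    aware = {0}                                  # a single initially aware node
    for step in range(10):
        newly = {j for i in aware for j in adj[i] if random.random() < p_spread}
        aware |= newly
        print(f"step {step}: {len(aware)} aware")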
6. arXiv:2502.05471 [pdf, other] (cs.SD, eess.AS)
   Enhancing Expressive Voice Conversion with Discrete Pitch-Conditioned Flow Matching Model
   Authors: Jialong Zuo, Shengpeng Ji, Minghui Fang, Ziyue Jiang, Xize Cheng, Qian Yang, Wenrui Liu, Guangyan Zhang, Zehai Tu, Yiwen Guo, Zhou Zhao
   Abstract: This paper introduces PFlow-VC, a conditional flow matching voice conversion model that leverages fine-grained discrete pitch tokens and target speaker prompt information for expressive voice conversion (VC). Previous VC works primarily focus on speaker conversion, with further exploration needed in enhancing expressiveness (such as prosody and emotion) for timbre conversion. Unlike previous methods, we adopt a simple and efficient approach to enhance the style expressiveness of voice conversion models. Specifically, we pretrain a self-supervised pitch VQVAE model to discretize speaker-irrelevant pitch information and leverage a masked pitch-conditioned flow matching model for Mel-spectrogram synthesis, which provides in-context pitch modeling capabilities for the speaker conversion model, effectively improving the voice style transfer capacity. Additionally, we improve timbre similarity by combining global timbre embeddings with time-varying timbre tokens. Experiments on the unseen LibriTTS test-clean set and the emotional speech dataset ESD show the superiority of the PFlow-VC model in both timbre conversion and style transfer. Audio samples are available on the demo page https://speechai-demo.github.io/PFlow-VC/.
   Submitted 8 February, 2025; originally announced February 2025.
   Comments: Accepted by ICASSP 2025
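   Conditional flow matching, the training principle named above, regresses a network onto the velocity of a path between noise and data. Below is the generic linear-path form with a placeholder conditioning vector; the actual model conditions on pitch tokens and speaker prompts and synthesizes Mel-spectrograms, so treat this purely as a sketch of the loss.

    # Generic conditional flow matching loss: linear interpolation path,
    # constant-velocity regression target. Model and dims are placeholders.
    import torch
    import torch.nn as nn

    class VelocityNet(nn.Module):
        def __init__(self, dim=80, cond_dim=16):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(dim + cond_dim + 1, 256),
                                     nn.SiLU(), nn.Linear(256, dim))
        def forward(self, x_t, t, cond):
            return self.net(torch.cat([x_t, cond, t], dim=-1))

    def cfm_loss(model, x1, cond):
        x0 = torch.randn_like(x1)                 # noise sample
        t = torch.rand(x1.size(0), 1)             # random time in [0, 1]
        x_t = (1 - t) * x0 + t * x1               # straight-line path
        target_v = x1 - x0                        # its constant velocity
        return ((model(x_t, t, cond) - target_v) ** 2).mean()

    print(cfm_loss(VelocityNet(), torch.randn(4, 80), torch.randn(4, 16)))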
7. arXiv:2502.05053 [pdf, other] (cs.RO)
   Gaze-Guided Robotic Vascular Ultrasound Leveraging Human Intention Estimation
   Authors: Yuan Bi, Yang Su, Nassir Navab, Zhongliang Jiang
   Abstract: Medical ultrasound has been widely used to examine vascular structure in modern clinical practice. However, traditional ultrasound examination often faces challenges related to inter- and intra-operator variation. The robotic ultrasound system (RUSS) appears as a potential solution for such challenges because of its superiority in stability and reproducibility. Given the complex anatomy of human vasculature, multiple vessels often appear in ultrasound images, or a single vessel bifurcates into branches, complicating the examination process. To tackle this challenge, this work presents a gaze-guided RUSS for vascular applications. A gaze tracker captures the eye movements of the operator. The extracted gaze signal guides the RUSS to follow the correct vessel when it bifurcates. Additionally, a gaze-guided segmentation network is proposed to enhance segmentation robustness by exploiting gaze information. However, gaze signals are often noisy, requiring interpretation to accurately discern the operator's true intentions. To this end, this study proposes a stabilization module to process raw gaze data. The inferred attention heatmap is utilized as a region proposal to aid segmentation and serves as a trigger signal when the operator needs to adjust the scanning target, such as when a bifurcation appears. To ensure appropriate contact between the probe and surface during scanning, an automatic ultrasound confidence-based orientation correction method is developed. In experiments, we demonstrate the efficiency of the proposed gaze-guided segmentation pipeline by comparing it with other methods. In addition, the performance of the proposed gaze-guided RUSS as a whole was validated on a realistic arm phantom with an uneven surface.
   Submitted 7 February, 2025; originally announced February 2025.
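   As a stand-in for the stabilization module that processes raw gaze data, the sketch below applies a simple exponential moving average to a noisy 2-D gaze stream before it would be used as a region proposal. This is a generic filter chosen for illustration, not the paper's module.

    # Toy gaze stabilization: exponential smoothing of a noisy 2-D gaze track.
    import numpy as np

    def smooth_gaze(points, alpha=0.3):
        out, state = [], points[0]
        for p in points:
            state = alpha * p + (1 - alpha) * state   # exponential moving average
            out.append(state)
        return np.array(out)

    raw = np.cumsum(np.random.randn(50, 2), axis=0)   # synthetic noisy gaze track
    print(smooth_gaze(raw)[-1])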
8. arXiv:2502.03862 [pdf, other] (cs.HC) — DOI: 10.1145/3706598.3714189
   Enhancing Deliberativeness: Evaluating the Impact of Multimodal Reflection Nudges
   Authors: ShunYi Yeo, Zhuoqun Jiang, Anthony Tang, Simon Tangi Perrault
   Abstract: Nudging participants with text-based reflective nudges enhances deliberation quality on online deliberation platforms. The effectiveness of multimodal reflective nudges, however, remains largely unexplored. Given the multi-sensory nature of human perception, incorporating diverse modalities into self-reflection mechanisms has the potential to better support various reflective styles. This paper explores how presenting reflective nudges of different types (direct: persona; indirect: storytelling) in different modalities (text, image, video, and audio) affects deliberation quality. We conducted two user studies with 20 and 200 participants, respectively. The first study identifies the preferred modality for each type of reflective nudge, revealing that text is most preferred for persona and video is most preferred for storytelling. The second study assesses the impact of these modalities on deliberation quality. Our findings reveal distinct effects associated with each modality, providing valuable insights for developing more inclusive and effective online deliberation platforms.
   Submitted 7 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
   Comments: CHI 2025
9. arXiv:2502.02315 [pdf, other] (cs.LG, cs.CL)
   VaiBot: Shuttle Between the Instructions and Parameters of Large Language Models
   Authors: Wangtao Sun, Haotian Xu, Huanxuan Liao, Xuanqing Yu, Zhongtao Jiang, Shizhu He, Jun Zhao, Kang Liu
   Abstract: How to interact with LLMs through instructions has been widely studied by researchers. However, previous studies have treated the emergence of instructions and the training of LLMs on task data as separate processes, overlooking the inherent unity between the two. This paper proposes a neural network framework, VaiBot, that integrates VAE and VIB, designed to uniformly model, learn, and infer both deduction and induction tasks under LLMs. Through experiments, we demonstrate that VaiBot performs on par with existing baseline methods in terms of deductive capabilities while significantly surpassing them in inductive capabilities. We also find that VaiBot can scale up using general instruction-following data and exhibits excellent one-shot induction abilities. We finally integrate the deductive and inductive processes of VaiBot synergistically. Through t-SNE dimensionality reduction, we observe that its inductive-deductive process significantly improves the distribution of training parameters, enabling it to outperform baseline methods in inductive reasoning tasks. The code and data for this paper can be found at https://anonymous.4open.science/r/VaiBot-021F
   Submitted 12 February, 2025; v1 submitted 4 February, 2025; originally announced February 2025.
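   The VAE-plus-VIB combination can be pictured as a reconstruction term plus a KL bottleneck penalty on the latent. The sketch below shows that objective shape on stand-in linear encoder/decoder modules; it is a schematic of the loss family, not VaiBot's architecture.

    # Schematic VAE + variational information bottleneck objective:
    # reconstruction loss plus a weighted KL penalty on the latent code.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    enc = nn.Linear(32, 2 * 8)      # outputs mean and log-variance of z
    dec = nn.Linear(8, 32)

    def vae_vib_loss(x, beta=1e-2):
        mu, logvar = enc(x).chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()   # reparameterize
        recon = F.mse_loss(dec(z), x)
        kl = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(-1).mean()
        return recon + beta * kl                               # bottleneck weight

    print(vae_vib_loss(torch.randn(4, 32)))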
10. arXiv:2502.01317 [pdf, other] (cs.HC, cs.CY)
    DietGlance: Dietary Monitoring and Personalized Analysis at a Glance with Knowledge-Empowered AI Assistant
    Authors: Zhihan Jiang, Running Zhao, Lin Lin, Yue Yu, Handi Chen, Xinchen Zhang, Xuhai Xu, Yifang Wang, Xiaojuan Ma, Edith C. H. Ngai
    Abstract: Growing awareness of wellness has prompted people to consider whether their dietary patterns align with their health and fitness goals. In response, researchers have introduced various wearable dietary monitoring systems and dietary assessment approaches. However, these solutions are either limited to identifying foods with simple ingredients or insufficient in providing analysis of individual dietary behaviors with domain-specific knowledge. In this paper, we present DietGlance, a system that automatically monitors dietary intake in daily routines and delivers personalized analysis from knowledge sources. DietGlance first detects ingestive episodes from multimodal inputs using eyeglasses, capturing privacy-preserving meal images of various dishes being consumed. Based on the inferred food items and consumed quantities from these images, DietGlance further provides nutritional analysis and personalized dietary suggestions, empowered by a retrieval-augmented generation module on a reliable nutrition library. A short-term user study (N=33) and a four-week longitudinal study (N=16) demonstrate the usability and effectiveness of DietGlance.
    Submitted 3 February, 2025; originally announced February 2025.
11. arXiv:2502.00684 [pdf, other] (cs.LG, cs.AI)
    Compositional Concept-Based Neuron-Level Interpretability for Deep Reinforcement Learning
    Authors: Zeyu Jiang, Hai Huang, Xingquan Zuo
    Abstract: Deep reinforcement learning (DRL), through learning policies or values represented by neural networks, has successfully addressed many complex control problems. However, the neural networks introduced by DRL lack interpretability and transparency. Current DRL interpretability methods largely treat neural networks as black boxes, with few approaches delving into the internal mechanisms of policy/value networks. This limitation undermines trust in both the neural network models that represent policies and the explanations derived from them. In this work, we propose a novel concept-based interpretability method that provides fine-grained explanations of DRL models at the neuron level. Our method formalizes atomic concepts as binary functions over the state space and constructs complex concepts through logical operations. By analyzing the correspondence between neuron activations and concept functions, we establish interpretable explanations for individual neurons in policy/value networks. Experimental results on both continuous control tasks and discrete decision-making environments demonstrate that our method can effectively identify meaningful concepts that align with human understanding while faithfully reflecting the network's decision-making logic.
    Submitted 2 February, 2025; originally announced February 2025.
    Comments: 8 pages, 3 figures, IJCAI 2025
    ACM Class: I.2.6; I.2.1; I.2.4
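    The analysis pattern in this abstract (atomic concepts as binary functions over the state space, composed with logical operations and matched against neuron activations) can be sketched directly. The concepts, the fake neuron, and the use of correlation as the correspondence measure below are all invented for illustration.

    # Atomic concepts as binary predicates over states, composed via logic,
    # then scored by how well a neuron's activation tracks each concept.
    import numpy as np

    states = np.random.uniform(-1, 1, size=(1000, 4))       # toy state space

    moving_right = lambda s: s[:, 2] > 0                    # atomic concept
    near_goal    = lambda s: np.abs(s[:, 0]) < 0.2          # atomic concept
    right_and_near = lambda s: moving_right(s) & near_goal(s)  # logical AND

    # A fabricated neuron that happens to encode the composite concept.
    neuron_act = np.tanh(3 * states[:, 2] * (np.abs(states[:, 0]) < 0.2))

    for name, concept in [("moving_right", moving_right),
                          ("near_goal", near_goal),
                          ("right_and_near", right_and_near)]:
        corr = np.corrcoef(concept(states).astype(float), neuron_act)[0, 1]
        print(f"{name}: correlation with neuron activation = {corr:.2f}")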
arXiv:2502.00026 [pdf, other] cs.AR cs.AI
Pushing the Limits of BFP on Narrow Precision LLM Inference
Authors: Hui Wang, Yuan Cheng, Xiaomeng Han, Zhengpeng Zhao, Dawei Yang, Zhe Jiang
Abstract: The substantial computational and memory demands of Large Language Models (LLMs) hinder their deployment. Block Floating Point (BFP) has proven effective in accelerating linear operations, a cornerstone of LLM workloads. However, as sequence lengths grow, nonlinear operations, such as Attention, increasingly become performance bottlenecks due to their quadratic computational complexity. These nonlinear operations are predominantly executed using inefficient floating-point formats, which makes the system hard to optimize for software efficiency and hardware overhead. In this paper, we delve into the limitations and potential of applying BFP to nonlinear operations. Given our findings, we introduce a hardware-software co-design framework (DB-Attn), including: (i) DBFP, an advanced BFP version, which overcomes nonlinear operation challenges with a pivot-focus strategy for diverse data and an adaptive grouping strategy for flexible exponent sharing; (ii) DH-LUT, a novel lookup table algorithm dedicated to accelerating nonlinear operations with the DBFP format; and (iii) an RTL-level DBFP-based engine implemented to support DB-Attn, applicable to FPGA and ASIC. Results show that DB-Attn provides significant performance improvements with negligible accuracy loss, achieving a 74% GPU speedup on the Softmax of LLaMA and a 10x lower-overhead performance improvement over SOTA designs.
Submitted 7 February, 2025; v1 submitted 21 January, 2025; originally announced February 2025.
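For readers unfamiliar with the format, block floating point stores one shared exponent per block of values with per-value integer mantissas, trading the precision of small values for cheap integer arithmetic. The sketch below is plain generic BFP quantization, not the paper's DBFP variant with pivot-focus and adaptive grouping.

```python
import numpy as np

def bfp_quantize(block, mantissa_bits=8):
    """Quantize a 1-D block to one shared exponent + integer mantissas."""
    shared_exp = int(np.ceil(np.log2(np.max(np.abs(block)) + 1e-12)))
    scale = 2.0 ** (shared_exp - (mantissa_bits - 1))
    mantissas = np.clip(np.round(block / scale),
                        -(2 ** (mantissa_bits - 1)),
                        2 ** (mantissa_bits - 1) - 1).astype(np.int32)
    return shared_exp, mantissas, scale

x = np.array([0.12, -3.5, 0.007, 1.9])
exp, m, scale = bfp_quantize(x)
# One exponent serves the whole block; tiny values lose precision.
print(exp, m, m * scale)
```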
arXiv:2501.18880 [pdf, other] cs.CV cs.LG
RLS3: RL-Based Synthetic Sample Selection to Enhance Spatial Reasoning in Vision-Language Models for Indoor Autonomous Perception
Authors: Joshua R. Waite, Md. Zahid Hasan, Qisai Liu, Zhanhong Jiang, Chinmay Hegde, Soumik Sarkar
Abstract: Vision-language model (VLM) fine-tuning for application-specific visual grounding based on natural language instructions has become one of the most popular approaches for learning-enabled autonomous systems. However, such fine-tuning relies heavily on high-quality datasets to achieve successful performance in various downstream tasks. Additionally, VLMs often encounter limitations due to insufficient and imbalanced fine-tuning data. To address these issues, we propose a new generalizable framework to improve VLM fine-tuning by integrating it with a reinforcement learning (RL) agent. Our method utilizes the RL agent to manipulate objects within an indoor setting to create synthetic data for fine-tuning, addressing certain vulnerabilities of the VLM. Specifically, we use the performance of the VLM to provide feedback to the RL agent so that it generates informative data that efficiently fine-tunes the VLM on the targeted task (e.g., spatial reasoning). The key contribution of this work is a framework in which the RL agent serves as an informative data sampling tool and assists the VLM in order to enhance performance and address task-specific vulnerabilities. By targeting the data sampling process at the weaknesses of the VLM, we can effectively train a more context-aware model. In addition, generating synthetic data allows us to have precise control over each scene and to generate granular ground-truth captions. Our results show that the proposed data generation approach improves the spatial reasoning performance of VLMs, which demonstrates the benefits of using RL-guided data generation in vision-language tasks.
Submitted 30 January, 2025; originally announced January 2025.
Comments: ICCPS 2025 accepted paper, 10 pages, 9 figures
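The feedback loop described above reduces to a simple control flow. In the toy sketch below, every component (ToyAgent, render, evaluate_vlm, the scalar vlm_skill) is a hypothetical stand-in, so only the RL-in-the-loop pattern is meaningful.

```python
import random

class ToyAgent:
    def act(self):             # "arrange objects": here, just pick a layout id
        return random.choice(["A", "B", "C"])
    def update(self, reward):  # reward scenes that expose VLM weaknesses
        pass

def render(scene):             # synthetic image + exact ground-truth caption
    return f"img_{scene}", f"caption for layout {scene}"

def evaluate_vlm(vlm_skill, image, caption):
    return 1.0 - vlm_skill     # toy error: a weaker model makes a bigger error

vlm_skill, agent = 0.2, ToyAgent()
for _ in range(5):
    scene = agent.act()
    image, caption = render(scene)
    error = evaluate_vlm(vlm_skill, image, caption)
    agent.update(reward=error)                      # informative scenes pay off
    vlm_skill = min(1.0, vlm_skill + 0.1 * error)   # "fine-tune" on the pair
print(round(vlm_skill, 2))
```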
arXiv:2501.16551 [pdf, other] cs.CV cs.AI cs.LG
PackDiT: Joint Human Motion and Text Generation via Mutual Prompting
Authors: Zhongyu Jiang, Wenhao Chai, Zhuoran Zhou, Cheng-Yen Yang, Hsiang-Wei Huang, Jenq-Neng Hwang
Abstract: Human motion generation has advanced markedly with the advent of diffusion models. Most recent studies have concentrated on generating motion sequences based on text prompts, commonly referred to as text-to-motion generation. However, the bidirectional generation of motion and text, enabling tasks such as motion-to-text alongside text-to-motion, has been largely unexplored. This capability is essential for aligning diverse modalities and supports unconditional generation. In this paper, we introduce PackDiT, the first diffusion-based generative model capable of performing various tasks simultaneously, including motion generation, motion prediction, text generation, text-to-motion, motion-to-text, and joint motion-text generation. Our core innovation leverages mutual blocks to integrate multiple diffusion transformers (DiTs) across different modalities seamlessly. We train PackDiT on the HumanML3D dataset, achieving state-of-the-art text-to-motion performance with an FID score of 0.106, along with superior results in motion prediction and in-between tasks. Our experiments further demonstrate that diffusion models are effective for motion-to-text generation, achieving performance comparable to that of autoregressive models.
Submitted 27 January, 2025; originally announced January 2025.

arXiv:2501.16388 [pdf, other] cs.LG stat.AP
Development and Validation of a Dynamic Kidney Failure Prediction Model based on Deep Learning: A Real-World Study with External Validation
Authors: Jingying Ma, Jinwei Wang, Lanlan Lu, Yexiang Sun, Mengling Feng, Peng Shen, Zhiqin Jiang, Shenda Hong, Luxia Zhang
Abstract: Background: Chronic kidney disease (CKD), a progressive disease with high morbidity and mortality, has become a significant global public health problem. At present, most of the models used for predicting the progression of CKD are static models. We aim to develop a dynamic kidney failure prediction model based on deep learning (KFDeep) for CKD patients, utilizing all available data on common clinical indicators from real-world Electronic Health Records (EHRs) to provide real-time predictions. Findings: A retrospective cohort of 4,587 patients from the EHRs of Yinzhou, China, is used as the development dataset (2,752 patients for training, 917 patients for validation) and internal validation dataset (917 patients), while a prospective cohort of 934 patients from the Peking University First Hospital CKD cohort (PKUFH cohort) is used as the external validation dataset. The AUROC of the KFDeep model reaches 0.946 (95% CI: 0.922-0.970) on the internal validation dataset and 0.805 (95% CI: 0.763-0.847) on the external validation dataset, both surpassing existing models. The KFDeep model demonstrates stable performance in simulated dynamic scenarios, with the AUROC progressively increasing over time. Both the calibration curve and decision curve analyses confirm that the model is unbiased and safe for practical use, while the SHAP analysis and hidden-layer clustering results align with established medical knowledge. Interpretation: The KFDeep model built from real-world EHRs enhances the prediction accuracy of kidney failure without increasing clinical examination costs and can be easily integrated into existing hospital systems, providing physicians with a continuously updated decision-support tool due to its dynamic design.
Submitted 25 January, 2025; originally announced January 2025.
arXiv:2501.14268 [pdf, other] cs.IR cs.AI
Pre-train and Fine-tune: Recommenders as Large Models
Authors: Zhenhao Jiang, Chenghao Chen, Hao Feng, Yu Yang, Jin Liu, Jie Zhang, Jia Jia, Ning Hu
Abstract: In reality, users have different interests in different periods, regions, scenes, etc. Such changes in interest are so drastic that they are difficult for recommenders to capture. Existing multi-domain learning can alleviate this problem. However, the structure of an industrial recommendation system is complex, the amount of data is huge, and the training cost is extremely high, so it is difficult to modify the structure of the industrial recommender and re-train it. To fill this gap, we consider recommenders as large pre-trained models and fine-tune them. We first propose the theory of the information bottleneck for fine-tuning and present an explanation for the fine-tuning technique in recommenders. To tailor it for recommendation, we design an information-aware adaptive kernel (IAK) technique to fine-tune the pre-trained recommender. Specifically, we define fine-tuning as two phases, knowledge compression and knowledge matching, and let the training stage of IAK explicitly approximate these two phases. Our proposed approach, designed from the essence of fine-tuning, is well interpretable. Extensive online and offline experiments show the superiority of our proposed method. Besides, we also share unique and important lessons we learned when deploying the method on a large-scale online platform. We also present the potential issues of fine-tuning techniques in recommendation systems and the corresponding solutions. The recommender with the IAK technique has been deployed on the homepage of a billion-scale online food platform for several months and has yielded considerable profits in our business.
Submitted 24 January, 2025; originally announced January 2025.
Comments: Accepted by WWW2025

arXiv:2501.13987 [pdf, other] cs.LG cs.AI
OstQuant: Refining Large Language Model Quantization with Orthogonal and Scaling Transformations for Better Distribution Fitting
Authors: Xing Hu, Yuan Cheng, Dawei Yang, Zukang Xu, Zhihang Yuan, Jiangyong Yu, Chen Xu, Zhe Jiang, Sifan Zhou
Abstract: Post-training quantization (PTQ) has emerged as a widely adopted technique for compressing and accelerating Large Language Models (LLMs). The major challenge in LLM quantization is that uneven and heavy-tailed data distributions can expand the quantization range, thereby reducing bit precision for most values. Recent methods attempt to eliminate outliers and balance inter-channel differences by employing linear transformations; however, they remain heuristic and often overlook optimizing the data distribution across the entire quantization space. In this paper, we introduce the Quantization Space Utilization Rate (QSUR), a novel metric that effectively assesses the quantizability of transformed data by measuring the space utilization of the data in the quantization space. We complement QSUR with mathematical derivations that examine the effects and limitations of various transformations, guiding our development of Orthogonal and Scaling Transformation-based Quantization (OSTQuant). OSTQuant employs a learnable equivalent transformation, consisting of an orthogonal transformation and a scaling transformation, to optimize the distributions of weights and activations across the entire quantization space. Furthermore, we propose the KL-Top loss function, designed to mitigate noise during optimization while retaining richer semantic information within the limited calibration data imposed by PTQ. OSTQuant outperforms existing work on various LLMs and benchmarks. In the W4-only setting, it retains 99.5% of the floating-point accuracy. In the more challenging W4A4KV4 configuration, OSTQuant reduces the performance gap by 32% on the LLaMA-3-8B model compared to state-of-the-art methods. Code: https://github.com/BrotherHappy/OSTQuant.
Submitted 23 January, 2025; originally announced January 2025.
Comments: 10 pages
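The "equivalent transformation" above rests on a standard identity: for an orthogonal matrix $Q$ and an invertible diagonal scaling $D$, $(WQD)(D^{-1}Q^\top x) = Wx$, so weights and activations can be reshaped for quantization without changing layer outputs. A quick numeric check of the identity (generic, not OSTQuant's learned transform):

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 4))          # weight matrix
x = rng.normal(size=4)               # activation vector

Q, _ = np.linalg.qr(rng.normal(size=(4, 4)))   # random orthogonal matrix
s = np.array([0.5, 2.0, 1.0, 4.0])             # per-channel scaling

# Rotate-and-scale the weights; apply the inverse to the activations.
W2 = (W @ Q) * s                     # transformed weights, W Q diag(s)
x2 = (Q.T @ x) / s                   # compensated activations
print(np.allclose(W @ x, W2 @ x2))   # True: the layer output is unchanged
```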
arXiv:2501.13514 [pdf, other] eess.IV cs.CV
Self-Supervised Diffusion MRI Denoising via Iterative and Stable Refinement
Authors: Chenxu Wu, Qingpeng Kong, Zihang Jiang, S. Kevin Zhou
Abstract: Magnetic Resonance Imaging (MRI), including diffusion MRI (dMRI), serves as a "microscope" for anatomical structures and routinely mitigates the influence of low signal-to-noise-ratio scans by compromising temporal or spatial resolution. However, these compromises fail to meet clinical demands for both efficiency and precision. Consequently, denoising is a vital preprocessing step, particularly for dMRI, where clean data is unavailable. In this paper, we introduce Di-Fusion, a fully self-supervised denoising method that leverages the latter diffusion steps and an adaptive sampling process. Unlike previous approaches, our single-stage framework achieves efficient and stable training without extra noise model training and offers adaptive and controllable results in the sampling process. Our thorough experiments on real and simulated data demonstrate that Di-Fusion achieves state-of-the-art performance in microstructure modeling, tractography tracking, and other downstream tasks.
Submitted 23 January, 2025; originally announced January 2025.
Comments: 39 pages, 34 figures
Journal ref: ICLR 2025
arXiv:2501.13484 [pdf, other] cs.LG cs.AI cs.CL
MambaQuant: Quantizing the Mamba Family with Variance Aligned Rotation Methods
Authors: Zukang Xu, Yuxuan Yue, Xing Hu, Zhihang Yuan, Zixu Jiang, Zhixuan Chen, Jiangyong Yu, Chen Xu, Sifan Zhou, Dawei Yang
Abstract: Mamba is an efficient sequence model that rivals Transformers and demonstrates significant potential as a foundational architecture for various tasks. Quantization is commonly used in neural networks to reduce model size and computational latency. However, applying quantization to Mamba remains underexplored, and existing quantization methods, which have been effective for CNN and Transformer models, appear inadequate for Mamba models (e.g., QuaRot suffers a 21% accuracy drop on Vim-T$^\dagger$ even under W8A8). We have pioneered the exploration of this issue and identified several key challenges. First, significant outliers are present in gate projections, output projections, and matrix multiplications. Second, Mamba's unique parallel scan further amplifies these outliers, leading to uneven and heavy-tailed data distributions. Third, even with the application of the Hadamard transform, the variance across channels in weights and activations still remains inconsistent. To these ends, we propose MambaQuant, a post-training quantization (PTQ) framework consisting of: 1) Karhunen-Loeve Transformation (KLT) enhanced rotation, rendering the rotation matrix adaptable to diverse channel distributions; 2) Smooth-Fused rotation, which equalizes channel variances and can merge additional parameters into model weights. Experiments show that MambaQuant can quantize both weights and activations into 8-bit with less than 1% accuracy loss for Mamba-based vision and language tasks. To the best of our knowledge, MambaQuant is the first comprehensive PTQ design for the Mamba family, paving the way for further advancements in its application.
Submitted 6 February, 2025; v1 submitted 23 January, 2025; originally announced January 2025.
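The Hadamard transform mentioned above is popular in rotation-based PTQ because it spreads a single-channel outlier evenly across all channels while preserving norms, which shrinks the per-channel quantization range. A small demonstration of that effect (generic, not MambaQuant's KLT-enhanced rotation):

```python
import numpy as np
from scipy.linalg import hadamard

n = 8
H = hadamard(n) / np.sqrt(n)          # orthonormal Hadamard matrix
x = np.zeros(n)
x[3] = 100.0                          # activation with one extreme outlier

y = H @ x                             # rotated activations
print(np.max(np.abs(x)), np.max(np.abs(y)))                # 100.0 vs ~35.4
print(np.allclose(np.linalg.norm(x), np.linalg.norm(y)))   # energy preserved
```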
arXiv:2501.12295 [pdf, other] cs.CV
Towards Accurate Unified Anomaly Segmentation
Authors: Wenxin Ma, Qingsong Yao, Xiang Zhang, Zhelong Huang, Zihang Jiang, S. Kevin Zhou
Abstract: Unsupervised anomaly detection (UAD) from images strives to model normal data distributions, creating discriminative representations to distinguish and precisely localize anomalies. Despite recent advancements in the efficient and unified one-for-all scheme, challenges persist in accurately segmenting anomalies for further monitoring. Moreover, this problem is obscured by the widely used AUROC metric under imbalanced UAD settings. This motivates us to emphasize the significance of precise segmentation of anomaly pixels, using pAP and DSC as metrics. To address the unsolved segmentation task, we introduce Unified Anomaly Segmentation (UniAS). UniAS presents a multi-level hybrid pipeline that progressively enhances normal information from coarse to fine, incorporating a novel multi-granularity gated CNN (MGG-CNN) into Transformer layers to explicitly aggregate local details from different granularities. UniAS achieves state-of-the-art anomaly segmentation performance, attaining 65.12/59.33 and 40.06/32.50 in pAP/DSC on the MVTec-AD and VisA datasets, respectively, surpassing previous methods significantly. The code is shared at https://github.com/Mwxinnn/UniAS.
Submitted 21 January, 2025; originally announced January 2025.
Comments: 8 pages, 5 figures
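For reference, DSC here is the standard Dice similarity coefficient between a predicted anomaly mask $P$ and a ground-truth mask $G$:

$$\mathrm{DSC}(P, G) = \frac{2\,|P \cap G|}{|P| + |G|}$$

pAP is the analogous pixel-level average precision; unlike AUROC, both stay informative when anomalous pixels are a tiny fraction of the image.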
arXiv:2501.11353 [pdf, other] cs.IT
Accelerating Data Access for Single Node in Distributed Storage Systems via MDS Codes
Authors: Hao Shi, Zhengyi Jiang, Zhongyi Huang, Linqi Song, Hanxu Hou
Abstract: Maximum distance separable (MDS) array codes are widely employed in modern distributed storage systems to provide high data reliability with small storage overhead. Compared with the data access latency of the entire file, the data access latency of a single node in a distributed storage system is equally important. In this paper, we propose two algorithms to effectively reduce the data access latency of a single node in different scenarios for MDS codes. We show theoretically that our algorithms achieve an expected reduction ratio of $\frac{(n-k)(n-k+1)}{n(n+1)}$ and $\frac{n-k}{n}$ for the data access latency of a single node when it obeys a uniform distribution and a shifted-exponential distribution, respectively, where $n$ and $k$ are the number of all nodes and the number of data nodes, respectively. In the worst-case analysis, we show that our algorithms achieve a reduction ratio of more than $60\%$ when $(n,k)=(3,2)$. Furthermore, in simulation experiments, we use the Monte Carlo simulation algorithm to demonstrate lower data access latency compared with the baseline algorithm.
Submitted 20 January, 2025; originally announced January 2025.
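The two expected reduction ratios above are easy to evaluate for a concrete code. For $(n,k)=(3,2)$, $\frac{(n-k)(n-k+1)}{n(n+1)} = \frac{1\cdot 2}{3\cdot 4} = \frac{1}{6}$ under the uniform distribution and $\frac{n-k}{n} = \frac{1}{3}$ under the shifted-exponential distribution (the $60\%$ figure in the abstract is the separate worst-case bound, not these expectations). A two-line check:

```python
from fractions import Fraction

def expected_reduction(n, k):
    """Expected latency-reduction ratios stated in the abstract."""
    uniform = Fraction((n - k) * (n - k + 1), n * (n + 1))
    shifted_exp = Fraction(n - k, n)
    return uniform, shifted_exp

print(expected_reduction(3, 2))   # (Fraction(1, 6), Fraction(1, 3))
```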
arXiv:2501.10761 [pdf, other] cs.CV
Infrared and Visible Image Fusion: From Data Compatibility to Task Adaption
Authors: Jinyuan Liu, Guanyao Wu, Zhu Liu, Di Wang, Zhiying Jiang, Long Ma, Wei Zhong, Xin Fan, Risheng Liu
Abstract: Infrared-visible image fusion (IVIF) is a critical task in computer vision, aimed at integrating the unique features of both infrared and visible spectra into a unified representation. Since 2018, the field has entered the deep learning era, with an increasing variety of approaches introducing a range of networks and loss functions to enhance visual performance. However, challenges such as data compatibility, perception accuracy, and efficiency remain. Unfortunately, there is a lack of recent comprehensive surveys that address this rapidly expanding domain. This paper fills that gap by providing a thorough survey covering a broad range of topics. We introduce a multi-dimensional framework to elucidate common learning-based IVIF methods, from visual enhancement strategies to data compatibility and task adaptability. We also present a detailed analysis of these approaches, accompanied by a lookup table clarifying their core ideas. Furthermore, we summarize performance comparisons, both quantitatively and qualitatively, focusing on registration, fusion, and subsequent high-level tasks. Beyond technical analysis, we discuss potential future directions and open issues in this area. For further details, visit our GitHub repository: https://github.com/RollingPlain/IVIF_ZOO.
Submitted 18 January, 2025; originally announced January 2025.

arXiv:2501.10702 [pdf, other] cs.ET
A RRAM-based In-Memory Computing Architecture for Binary Matrix-Vector Multiplication
Authors: Hao Yue, Yihao Chen, Zhelong Jiang, Zhigang Li, Gang Chen, Huaxiang Lu
Abstract: Binary matrix-vector multiplication, as one of the core operations in many post-quantum cryptography (PQC) schemes, has found widespread applications in the field of information encryption. However, the transmission bottleneck of large matrices between the computation and memory modules leads to significant data-movement power consumption and high storage-bandwidth demands. In this paper, a partitioned RRAM-based in-memory computing (IMC) architecture is proposed, aiming to overcome the data transmission bottleneck and efficiently perform binary-field matrix-vector multiplication. Experimental results demonstrate the proposed in-memory computing architecture...
Submitted 31 January, 2025; v1 submitted 18 January, 2025; originally announced January 2025.
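As background, a binary-field matrix-vector product is pure bitwise logic: entry $y_i$ is the parity of the popcount of $A_i \,\&\, x$, i.e. an AND followed by an XOR reduction. The reference implementation below shows the arithmetic that the RRAM array carries out in place instead of shuttling the matrix to a compute unit; the example matrix values are arbitrary.

```python
def gf2_matvec(rows, x):
    """Over GF(2): y_i = parity(popcount(row_i AND x)); rows, x are bitmasks."""
    return [bin(row & x).count("1") & 1 for row in rows]

# A 3x4 binary matrix stored as row bitmasks (arbitrary example values).
A = [0b1011, 0b0110, 0b1111]
x = 0b1101
print(gf2_matvec(A, x))   # [0, 1, 1]
```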
arXiv:2501.08682 [pdf, other] cs.CV cs.GR
RealVVT: Towards Photorealistic Video Virtual Try-on via Spatio-Temporal Consistency
Authors: Siqi Li, Zhengkai Jiang, Jiawei Zhou, Zhihong Liu, Xiaowei Chi, Haoqian Wang
Abstract: Virtual try-on has emerged as a pivotal task at the intersection of computer vision and fashion, aimed at digitally simulating how clothing items fit on the human body. Despite notable progress in single-image virtual try-on (VTO), current methodologies often struggle to preserve a consistent and authentic appearance of clothing across extended video sequences. This challenge arises from the complexities of capturing dynamic human pose and maintaining target clothing characteristics. We leverage pre-existing video foundation models to introduce RealVVT, a photoRealistic Video Virtual Try-on framework tailored to bolster stability and realism within dynamic video contexts. Our methodology encompasses a Clothing & Temporal Consistency strategy, an Agnostic-guided Attention Focus Loss mechanism to ensure spatial consistency, and a Pose-guided Long Video VTO technique adept at handling extended video sequences. Extensive experiments across various datasets confirm that our approach outperforms existing state-of-the-art models in both single-image and video VTO tasks, offering a viable solution for practical applications within the realms of fashion e-commerce and virtual fitting environments.
Submitted 15 January, 2025; originally announced January 2025.
Comments: 10 pages (8 pages main text, 2 pages references), 5 figures in the main text, and 4 pages supplementary materials with 3 additional figures
MSC Class: 68T99
arXiv:2501.08313 [pdf, other] cs.CL cs.CV
MiniMax-01: Scaling Foundation Models with Lightning Attention
Authors: MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, et al. (65 additional authors not shown)
Abstract: We introduce the MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01, is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering a 20-32 times longer context window. We publicly release MiniMax-01 at https://github.com/MiniMax-AI.
Submitted 14 January, 2025; originally announced January 2025.
Comments: A technical report from MiniMax. The authors are listed in alphabetical order. We open-sourced our MiniMax-01 at https://github.com/MiniMax-AI
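The gap between 456 billion total and 45.9 billion activated parameters comes from the usual MoE mechanism: a router selects a few of the 32 experts per token, so only those experts' parameters participate in that token's forward pass. A generic top-k gating sketch (illustrative only, not MiniMax's implementation):

```python
import numpy as np

def top_k_route(token, gate_w, k=2):
    """Pick k of n experts per token via softmax gating (generic MoE routing)."""
    logits = token @ gate_w                 # one logit per expert
    top = np.argsort(logits)[-k:]           # indices of the k best experts
    probs = np.exp(logits[top] - logits[top].max())
    return top, probs / probs.sum()         # experts to run + mixing weights

rng = np.random.default_rng(1)
token = rng.normal(size=16)                 # toy token embedding
gate_w = rng.normal(size=(16, 32))          # router weights for 32 experts
experts, weights = top_k_route(token, gate_w)
print(experts, weights.round(3))            # only these experts are "activated"
```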
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A technical report from MiniMax. The authors are listed in alphabetical order. We open-sourced our MiniMax-01 at https://github.com/MiniMax-AI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06832">arXiv:2501.06832</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06832">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> A novel multi-agent dynamic portfolio optimization learning system based on hierarchical deep reinforcement learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Y">Yue Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Stefanidis%2C+A">Angelos Stefanidis</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhengyong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jionglong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06832v1-abstract-short" style="display: inline;"> Deep Reinforcement Learning (DRL) has been extensively used to address portfolio optimization problems. The DRL agents acquire knowledge and make decisions through unsupervised interactions with their environment without requiring explicit knowledge of the joint dynamics of portfolio assets. Among these DRL algorithms, the combination of actor-critic algorithms and deep function approximators is t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06832v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06832v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06832v1-abstract-full" style="display: none;"> Deep Reinforcement Learning (DRL) has been extensively used to address portfolio optimization problems. The DRL agents acquire knowledge and make decisions through unsupervised interactions with their environment without requiring explicit knowledge of the joint dynamics of portfolio assets. Among these DRL algorithms, the combination of actor-critic algorithms and deep function approximators is the most widely used DRL algorithm. Here, we find that training the DRL agent using the actor-critic algorithm and deep function approximators may lead to scenarios where the improvement in the DRL agent&#39;s risk-adjusted profitability is not significant. We propose that such situations primarily arise from the following two problems: sparsity in positive reward and the curse of dimensionality. These limitations prevent DRL agents from comprehensively learning asset price change patterns in the training environment. As a result, the DRL agents cannot explore the dynamic portfolio optimization policy to improve the risk-adjusted profitability in the training process. 
To address these problems, we propose a novel multi-agent Hierarchical Deep Reinforcement Learning (HDRL) algorithmic framework in this research. Under this framework, the agents work together as a learning system for portfolio optimization. Specifically, by designing an auxiliary agent that works together with the executive agent for optimal policy exploration, the learning system can focus on exploring policies with higher risk-adjusted return in the region of the action space with positive return and low variance. In this way, we can overcome the curse of dimensionality and improve training efficiency in environments with sparse positive rewards. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03565">arXiv:2501.03565</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03565">pdf</a>, <a href="https://arxiv.org/format/2501.03565">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Bridged Semantic Alignment for Zero-shot 3D Medical Image Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lai%2C+H">Haoran Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zihang Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Q">Qingsong Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Rongsheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zhiyang He</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+X">Xiaodong Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+W">Wei Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+W">Weifu Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S+K">S. Kevin Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> 
3D medical images such as computed tomography (CT) are widely used in clinical practice, offering great potential for automatic diagnosis. Supervised learning-based approaches have achieved significant progress but rely heavily on extensive manual annotations, limited by the availability of training data and the diversity of abnormality types. Vision-language alignment (VLA) offers a promising alternative by enabling zero-shot learning without additional annotations. However, we empirically discover that the visual and textual embeddings produced by the alignment efforts of existing VLA methods form two well-separated clusters, presenting a wide gap to be bridged. To bridge this gap, we propose a Bridged Semantic Alignment (BrgSA) framework. First, we utilize a large language model to perform semantic summarization of reports, extracting high-level semantic information. Second, we design a Cross-Modal Knowledge Interaction (CMKI) module that leverages a cross-modal knowledge bank as a semantic bridge, facilitating interaction between the two modalities, narrowing the gap, and improving their alignment. To comprehensively evaluate our method, we construct a benchmark dataset that includes 15 underrepresented abnormalities, and we also utilize two existing benchmark datasets. Experimental results demonstrate that BrgSA achieves state-of-the-art performance on both public benchmark datasets and our custom-labeled dataset, with significant improvements in zero-shot diagnosis of underrepresented abnormalities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
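<p class="is-size-7">The cross-modal knowledge bank can be pictured as a shared set of learnable vectors that both modalities are re-expressed through, pulling the two embedding clusters toward a common space. The sketch below is a guess at the general shape of such a module; the class name, sizes, and forward pass are invented for illustration, not the authors' CMKI code.</p> <pre><code>import torch
import torch.nn as nn

class KnowledgeBankBridge(nn.Module):
    """Toy cross-modal bridge: each embedding is rewritten as an
    attention-weighted mixture of a shared, learnable knowledge bank.
    Hypothetical sketch inspired by the CMKI description above."""

    def __init__(self, dim=512, bank_size=256):
        super().__init__()
        self.bank = nn.Parameter(torch.randn(bank_size, dim) * 0.02)

    def forward(self, x):
        # x: (batch, dim) visual or text embedding
        attn = torch.softmax(x @ self.bank.t(), dim=-1)  # (batch, bank_size)
        return attn @ self.bank                          # back to (batch, dim)

bridge = KnowledgeBankBridge()
img, txt = torch.randn(4, 512), torch.randn(4, 512)
sim = torch.cosine_similarity(bridge(img), bridge(txt))  # alignment in bank space
print(sim.shape)  # torch.Size([4])
</code></pre>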
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02487">arXiv:2501.02487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02487">pdf</a>, <a href="https://arxiv.org/format/2501.02487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ACE++: Instruction-Based Image Creation and Editing via Context-Aware Content Filling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+C">Chaojie Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yulin Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zeyinzi Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Z">Zhen Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We report ACE++, an instruction-based diffusion framework that tackles various image generation and editing tasks. Inspired by the input format for the inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context Condition Unit (LCU) introduced in ACE and extend this input paradigm to any editing and generation tasks. To take full advantage of image generative priors, we develop a two-stage training scheme to minimize the effort of finetuning powerful text-to-image diffusion models like FLUX.1-dev. In the first stage, we pre-train the model using task data with the 0-ref tasks from the text-to-image model. There are many models in the community, based on the post-training of text-to-image foundation models, that meet this training paradigm of the first stage. For example, FLUX.1-Fill-dev deals primarily with inpainting tasks and can be used as an initialization to accelerate the training process. In the second stage, we finetune the above model to support the general instructions using all tasks defined in ACE. To promote the widespread application of ACE++ in different scenarios, we provide a comprehensive set of models that cover both full finetuning and lightweight finetuning, while considering general applicability and applicability in vertical scenarios. 
The qualitative analysis showcases the superiority of ACE++ in terms of generated image quality and prompt-following ability. Code and models will be available on the project page: https://ali-vilab.github.io/ACE_plus_page/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02385">arXiv:2501.02385</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02385">pdf</a>, <a href="https://arxiv.org/format/2501.02385">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Guiding Medical Vision-Language Models with Explicit Visual Prompts: Framework Design and Comprehensive Exploration of Prompt Variations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+K">Kangyu Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Z">Ziyuan Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+H">Huahui Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zekun Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lao%2C+Q">Qicheng Lao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shaoting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> While mainstream vision-language models (VLMs) have advanced rapidly in understanding image-level information, they still lack the ability to focus on specific areas designated by humans. Rather, they typically rely on large volumes of high-quality image-text paired data to learn and generate posterior attention maps. 
To address this critical issue, we propose leveraging visual prompts: simple visual markers in various forms to guide and enhance the formation of region-specific attention. Thus, we introduce MedVP, a pioneering framework that integrates medical entity extraction, visual prompt generation, and dataset adaptation for visual-prompt-guided fine-tuning. We successfully outperform recent state-of-the-art large models across multiple medical VQA datasets. Extensive experiments and human evaluation are conducted to analyze the impact of different visual prompt forms and how they contribute to performance improvement. The results demonstrate both the effectiveness and clinical significance of our approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NAACL 2025 Main Conference</span> </p>
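<p class="is-size-7">Visual prompting in this sense simply means drawing explicit markers on the input image before it reaches the model. A minimal illustration with Pillow follows; the file names, box coordinates, and the idea of a lesion region are invented examples, not the MedVP pipeline.</p> <pre><code>from PIL import Image, ImageDraw

def add_visual_prompt(image_path, box, color="red", width=4):
    """Overlay a simple visual prompt (a bounding box) on an image so a
    vision-language model attends to the marked region. Illustrative only."""
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    draw.rectangle(box, outline=color, width=width)
    return img

# e.g. mark a suspected region before sending the image to the VLM
# (hypothetical file names)
marked = add_visual_prompt("chest_xray.png", box=(120, 80, 260, 210))
marked.save("chest_xray_prompted.png")
</code></pre>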
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02321">arXiv:2501.02321</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02321">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> KD-MSLRT: Lightweight Sign Language Recognition Model Based on Mediapipe and 3D to 1D Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yulong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+B">Bolin Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+K">Ke Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Changyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhengyong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+K">Kang Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jionglong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Artificial intelligence has achieved notable results in sign language recognition and translation. However, relatively few efforts have been made to significantly improve the quality of life for the 72 million hearing-impaired people worldwide. Sign language translation models, which rely on video inputs, involve large parameter sizes, making them time-consuming and computationally intensive to deploy. This directly contributes to the scarcity of human-centered technology in this field. Additionally, the lack of datasets in sign language translation hampers research progress in this area. To address these issues, we first propose a cross-modal multi-knowledge distillation technique from 3D to 1D and a novel end-to-end pre-training text correction framework. Compared to other pre-trained models, our framework achieves significant advancements in correcting text output errors. Our model achieves a decrease in Word Error Rate (WER) of at least 1.4% on the PHOENIX14 and PHOENIX14T datasets compared to the state-of-the-art CorrNet. Additionally, the TensorFlow Lite (TFLite) quantized model size is reduced to 12.93 MB, making it the smallest, fastest, and most accurate model to date. We have also collected and released extensive Chinese sign language datasets, and developed a specialized training vocabulary. To address the lack of research on data augmentation for landmark data, we have designed comparative experiments on various augmentation methods. Moreover, we performed a simulated deployment and prediction of our model on Intel platform CPUs and assessed the feasibility of deploying the model on other platforms. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
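<p class="is-size-7">Any 3D-to-1D teacher/student setup of this kind rests on a standard knowledge-distillation objective: the small 1D student is trained to match the softened output distribution of the large video teacher. Below is the generic textbook form of that loss, not the paper's multi-knowledge variant.</p> <pre><code>import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=4.0):
    """Classic KD objective: match the student's softened distribution to
    the teacher's. In a 3D-to-1D setup the teacher sees video features and
    the student sees 1D landmark sequences; here both are just logits."""
    t = temperature
    soft_teacher = F.softmax(teacher_logits / t, dim=-1)
    log_student = F.log_softmax(student_logits / t, dim=-1)
    # The t*t factor keeps gradient magnitudes comparable across temperatures.
    return F.kl_div(log_student, soft_teacher, reduction="batchmean") * t * t

loss = distillation_loss(torch.randn(8, 100), torch.randn(8, 100))
print(float(loss))
</code></pre>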
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01895">arXiv:2501.01895</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01895">pdf</a>, <a href="https://arxiv.org/format/2501.01895">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EnerVerse: Envisioning Embodied Future Space for Robotics Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Siyuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liliang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+P">Pengfei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shengcong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhengkai Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yue Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Y">Yue Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+P">Peng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongsheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+M">Maoqing Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+G">Guanghui Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01895v2-abstract-short" style="display: inline;"> We introduce EnerVerse, a generative robotics foundation model that constructs and interprets embodied spaces. EnerVerse employs an autoregressive video diffusion framework to predict future embodied spaces from instructions, enhanced by a sparse context memory for long-term reasoning. To model the 3D robotics world, we propose Free Anchor Views (FAVs), a multi-view video representation offering f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01895v2-abstract-full').style.display = 'inline'; document.getElementById('2501.01895v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01895v2-abstract-full" style="display: none;"> We introduce EnerVerse, a generative robotics foundation model that constructs and interprets embodied spaces. EnerVerse employs an autoregressive video diffusion framework to predict future embodied spaces from instructions, enhanced by a sparse context memory for long-term reasoning. To model the 3D robotics world, we propose Free Anchor Views (FAVs), a multi-view video representation offering flexible, task-adaptive perspectives to address challenges like motion ambiguity and environmental constraints. Additionally, we present EnerVerse-D, a data engine pipeline combining the generative model with 4D Gaussian Splatting, forming a self-reinforcing data loop to reduce the sim-to-real gap. 
Leveraging these innovations, EnerVerse translates 4D world representations into physical actions via a policy head (EnerVerse-A), enabling robots to execute task instructions. EnerVerse-A achieves state-of-the-art performance in both simulation and real-world settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: https://sites.google.com/view/enerverse</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00332">arXiv:2501.00332</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00332">pdf</a>, <a href="https://arxiv.org/format/2501.00332">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MAIN-RAG: Multi-Agent Filtering Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+C">Chia-Yuan Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhimeng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Rakesh%2C+V">Vineeth Rakesh</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+M">Menghai Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Yeh%2C+C+M">Chin-Chia Michael Yeh</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guanchu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+M">Mingzhi Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhichao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+M">Mahashweta Das</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+N">Na Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> 
Large Language Models (LLMs) are becoming essential tools for various natural language processing tasks but often suffer from generating outdated or incorrect information. Retrieval-Augmented Generation (RAG) addresses this issue by incorporating external, real-time information retrieval to ground LLM responses. However, existing RAG systems frequently struggle with the quality of the retrieved documents, as irrelevant or noisy documents degrade performance, increase computational overhead, and undermine response reliability. To tackle this problem, we propose Multi-Agent Filtering Retrieval-Augmented Generation (MAIN-RAG), a training-free RAG framework that leverages multiple LLM agents to collaboratively filter and score retrieved documents. Specifically, MAIN-RAG introduces an adaptive filtering mechanism that dynamically adjusts the relevance filtering threshold based on score distributions, effectively minimizing noise while maintaining high recall of relevant documents. The proposed approach leverages inter-agent consensus to ensure robust document selection without requiring additional training data or fine-tuning. Experimental results across four QA benchmarks demonstrate that MAIN-RAG consistently outperforms traditional RAG approaches, achieving a 2-11% improvement in answer accuracy while reducing the number of irrelevant retrieved documents. Quantitative analysis further reveals that our approach achieves superior response consistency and answer accuracy over baseline methods, offering a competitive and practical alternative to training-based solutions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
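<p class="is-size-7">The adaptive-threshold idea can be paraphrased as: have several judge agents score each retrieved document, then derive the keep/drop cut-off from the score distribution itself rather than from a fixed constant. A toy reading of that mechanism follows; all names are invented, and <code>judges</code> stands in for wrapped LLM calls. This is an illustrative sketch, not the authors' code.</p> <pre><code>import statistics

def main_rag_filter(documents, judges, margin=0.5):
    """Toy multi-agent filter: average the judges' relevance scores per
    document, then keep documents above a threshold derived from the
    score distribution (mean minus a fraction of the std deviation)."""
    scores = [statistics.mean(j(doc) for j in judges) for doc in documents]
    mu = statistics.mean(scores)
    sigma = statistics.pstdev(scores)
    threshold = mu - margin * sigma  # adapts to each query's score spread
    return [d for d, s in zip(documents, scores) if s >= threshold]

# judges would wrap LLM relevance prompts; here, trivial stand-ins
judges = [lambda d: float(len(d) % 5), lambda d: 2.0]
print(main_rag_filter(["doc one", "another doc", "noise"], judges))
</code></pre>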
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20564">arXiv:2412.20564</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20564">pdf</a>, <a href="https://arxiv.org/format/2412.20564">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Self-Disclosure to AI: The Paradox of Trust and Vulnerability in Human-Machine Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z+Z">Zoe Zhiqiu Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> In this paper, we explore the paradox of trust and vulnerability in human-machine interactions, inspired by Alexander Reben's BlabDroid project. This project used small, unassuming robots that actively engaged with people, successfully eliciting personal thoughts or secrets from individuals, often more effectively than human counterparts. This phenomenon raises intriguing questions about how trust and self-disclosure operate in interactions with machines, even in their simplest forms. We study the change of trust in technology by analyzing the psychological processes behind such encounters. The analysis applies theories like Social Penetration Theory and Communication Privacy Management Theory to understand the balance between perceived security and the risk of exposure when personal information and secrets are shared with machines or AI. Additionally, we draw on philosophical perspectives, such as posthumanism and phenomenology, to engage with broader questions about trust, privacy, and vulnerability in the digital age. The rapid incorporation of AI into our most private areas challenges us to rethink and redefine our ethical responsibilities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024 Creative AI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19498">arXiv:2412.19498</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19498">pdf</a>, <a href="https://arxiv.org/format/2412.19498">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Casevo: A Cognitive Agents and Social Evolution Simulator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zexun Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yafang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Maoxu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+H">Hongjiang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yunxiao Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Q">Qinglan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Ye Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19498v1-abstract-short" style="display: inline;"> In this paper, we introduce a multi-agent simulation framework Casevo (Cognitive Agents and Social Evolution Simulator), that integrates large language models (LLMs) to simulate complex social phenomena and decision-making processes. Casevo is designed as a discrete-event simulator driven by agents with features such as Chain of Thoughts (CoT), Retrieval-Augmented Generation (RAG), and Customizabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19498v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19498v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19498v1-abstract-full" style="display: none;"> In this paper, we introduce a multi-agent simulation framework Casevo (Cognitive Agents and Social Evolution Simulator), that integrates large language models (LLMs) to simulate complex social phenomena and decision-making processes. Casevo is designed as a discrete-event simulator driven by agents with features such as Chain of Thoughts (CoT), Retrieval-Augmented Generation (RAG), and Customizable Memory Mechanism. Casevo enables dynamic social modeling, which can support various scenarios such as social network analysis, public opinion dynamics, and behavior prediction in complex social systems. To demonstrate the effectiveness of Casevo, we utilize one of the U.S. 2020 midterm election TV debates as a simulation example. Our results show that Casevo facilitates more realistic and flexible agent interactions, improving the quality of dynamic social phenomena simulation. This work contributes to the field by providing a robust system for studying large-scale, high-fidelity social behaviors with advanced LLM-driven agents, expanding the capabilities of traditional agent-based modeling (ABM). 
The open-source code repository of Casevo is available at https://github.com/rgCASS/casevo. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18904">arXiv:2412.18904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18904">pdf</a>, <a href="https://arxiv.org/format/2412.18904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedCFA: Alleviating Simpson's Paradox in Model Aggregation with Counterfactual Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhonghua Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jimin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+T">Tao Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+H">Haibin Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Federated learning (FL) is a promising technology for data privacy and distributed optimization, but it suffers from data imbalance and heterogeneity among clients. Existing FL methods try to solve these problems by aligning the client with the server model or by correcting the client model with control variables. These methods excel on IID and general non-IID data but perform mediocrely in Simpson's Paradox scenarios. Simpson's Paradox refers to the phenomenon that a trend observed on the global dataset disappears or reverses on a subset, which may mean that the global model obtained through aggregation in FL does not accurately reflect the distribution of the global data. 
Thus, we propose FedCFA, a novel FL framework employing counterfactual learning to generate counterfactual samples by replacing critical factors of local data with global average data, aligning local data distributions with the global distribution and mitigating Simpson's Paradox effects. In addition, to improve the quality of counterfactual samples, we introduce a factor decorrelation (FDC) loss to reduce the correlation among features and thus improve the independence of the extracted factors. We conduct extensive experiments on six datasets and verify that our method outperforms other FL methods in terms of efficiency and global model accuracy under limited communication rounds. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
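<p class="is-size-7">A compressed reading of the two mechanisms described above: counterfactual samples made by splicing globally averaged factors into local data, plus a decorrelation penalty on the extracted factors. The sketch below uses invented shapes and treats raw features as "factors" for simplicity; it is illustrative, not the authors' implementation.</p> <pre><code>import torch

def counterfactual_batch(local_x, global_avg, frac=0.3):
    """Replace a random fraction of feature 'factors' in local samples with
    global average data, yielding counterfactual samples whose swapped
    factors follow the global distribution. Illustrative sketch only."""
    mask = torch.lt(torch.rand(local_x.shape[-1]), frac)  # factors to swap
    return torch.where(mask, global_avg, local_x)

def fdc_loss(factors):
    """Factor decorrelation: penalize off-diagonal correlation among the
    factor columns, pushing them toward independence."""
    z = (factors - factors.mean(0)) / (factors.std(0) + 1e-6)
    corr = (z.t() @ z) / factors.shape[0]
    off_diag = corr - torch.diag(torch.diag(corr))
    return off_diag.pow(2).sum()

x = torch.randn(32, 16)                   # a local client batch
cf = counterfactual_batch(x, x.mean(0))   # x.mean(0) stands in for global average data
print(fdc_loss(x).item(), cf.shape)
</code></pre>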
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18696">arXiv:2412.18696</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18696">pdf</a>, <a href="https://arxiv.org/format/2412.18696">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> STITCH: Surface reconstrucTion using Implicit neural representations with Topology Constraints and persistent Homology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jignasu%2C+A">Anushrut Jignasu</a>, <a href="/search/cs?searchtype=author&amp;query=Herron%2C+E">Ethan Herron</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhanhong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Sarkar%2C+S">Soumik Sarkar</a>, <a href="/search/cs?searchtype=author&amp;query=Hegde%2C+C">Chinmay Hegde</a>, <a href="/search/cs?searchtype=author&amp;query=Ganapathysubramanian%2C+B">Baskar Ganapathysubramanian</a>, <a href="/search/cs?searchtype=author&amp;query=Balu%2C+A">Aditya Balu</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamurthy%2C+A">Adarsh Krishnamurthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> We present STITCH, a novel approach for neural implicit surface reconstruction of a sparse and irregularly spaced point cloud while enforcing topological constraints (such as having a single connected component). We develop a new differentiable framework based on persistent homology to formulate topological loss terms that enforce the prior of a single 2-manifold object. Our method demonstrates excellent performance in preserving the topology of complex 3D geometries, evident through both visual and empirical comparisons. We supplement this with a theoretical analysis, and provably show that optimizing the loss with stochastic (sub)gradient descent leads to convergence and enables reconstructing shapes with a single connected component. Our approach showcases the integration of differentiable topological data analysis tools for implicit surface reconstruction. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
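<p class="is-size-7">For intuition, the 0-dimensional part of such a topological prior can be read off a minimum spanning tree of the point cloud: each MST edge length is the scale at which two components merge, so a single-component prior penalizes late merges. The toy version below is non-differentiable (SciPy-based) and is only an intuition aid; the paper's differentiable persistent-homology formulation is more involved.</p> <pre><code>import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import pdist, squareform

def single_component_penalty(points, eps=0.1):
    """0-dim persistence sketch: MST edge lengths are the scales at which
    components merge. Penalizing lengths above eps expresses the prior of
    one connected component. Toy, non-differentiable illustration."""
    dists = squareform(pdist(points))
    mst = minimum_spanning_tree(dists).toarray()
    deaths = mst[mst.nonzero()]  # the n-1 merge scales
    return float(np.clip(deaths - eps, 0.0, None).sum())

pts = np.vstack([np.random.randn(50, 3), np.random.randn(50, 3) + 5.0])
print(single_component_penalty(pts))  # large: two far-apart clusters
</code></pre>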
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures, 29 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18431">arXiv:2412.18431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18431">pdf</a>, <a href="https://arxiv.org/format/2412.18431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> GeAR: Graph-enhanced Agent for Retrieval-augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zhili Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Diao%2C+C">Chenxin Diao</a>, <a href="/search/cs?searchtype=author&amp;query=Vougiouklis%2C+P">Pavlos Vougiouklis</a>, <a href="/search/cs?searchtype=author&amp;query=Merita%2C+P">Pascual Merita</a>, <a href="/search/cs?searchtype=author&amp;query=Piramanayagam%2C+S">Shriram Piramanayagam</a>, <a href="/search/cs?searchtype=author&amp;query=Graux%2C+D">Damien Graux</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+D">Dandan Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zeren Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+R">Ruofei Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yang Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J+Z">Jeff Z. Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18431v1-abstract-short" style="display: inline;"> Retrieval-augmented generation systems rely on effective document retrieval capabilities. By design, conventional sparse or dense retrievers face challenges in multi-hop retrieval scenarios. In this paper, we present GeAR, which advances RAG performance through two key innovations: (i) graph expansion, which enhances any conventional base retriever, such as BM25, and (ii) an agent framework that i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18431v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18431v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18431v1-abstract-full" style="display: none;"> Retrieval-augmented generation systems rely on effective document retrieval capabilities. By design, conventional sparse or dense retrievers face challenges in multi-hop retrieval scenarios. In this paper, we present GeAR, which advances RAG performance through two key innovations: (i) graph expansion, which enhances any conventional base retriever, such as BM25, and (ii) an agent framework that incorporates graph expansion. Our evaluation demonstrates GeAR&#39;s superior retrieval performance on three multi-hop question answering datasets. 
Additionally, our system achieves state-of-the-art results with improvements exceeding 10% on the challenging MuSiQue dataset, while requiring fewer tokens and iterations compared to other multi-step retrieval systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
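<p class="is-size-7">Graph expansion on top of a base retriever can be pictured as: retrieve seed documents with BM25, then add documents linked to the seeds in a document graph, reaching evidence that is connected to, but not lexically similar to, the query. The toy sketch below invents the graph, the <code>bm25_search</code> callable, and the hop logic; it is an illustrative reading of the idea, not GeAR itself.</p> <pre><code>def graph_expanded_retrieve(query, bm25_search, doc_graph, k=5, hops=1):
    """Expand a conventional retriever's results with graph neighbors,
    helping multi-hop questions. Illustrative sketch only."""
    results = list(bm25_search(query, k))      # seed documents
    frontier, seen = set(results), set(results)
    for _ in range(hops):
        neighbors = set()
        for doc in frontier:
            neighbors.update(doc_graph.get(doc, ()))
        frontier = neighbors - seen            # only newly reached docs
        results.extend(sorted(frontier))
        seen.update(frontier)
    return results

graph = {"d1": ["d7"], "d7": ["d9"]}
print(graph_expanded_retrieve("q", lambda q, k: ["d1"], graph, hops=2))
# ['d1', 'd7', 'd9']
</code></pre>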
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18216">arXiv:2412.18216</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18216">pdf</a>, <a href="https://arxiv.org/format/2412.18216">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ICM-Assistant: Instruction-tuning Multimodal Large Language Models for Rule-based Explainable Image Content Moderation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mengyang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yuzhi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jialun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingjie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhongming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuehui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qinbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Guangneng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shengchao Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+C">Chi-Wing Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Controversial content largely inundates the Internet, infringing various cultural norms and child protection standards. Traditional Image Content Moderation (ICM) models fall short in producing precise moderation decisions for diverse standards, while recent multimodal large language models (MLLMs), when adopted for general rule-based ICM, often produce classification and explanation results that are inconsistent with human moderators. Aiming at flexible, explainable, and accurate ICM, we design a novel rule-based dataset generation pipeline, decomposing concise human-defined rules and leveraging well-designed multi-stage prompts to enrich short explicit image annotations. Our ICM-Instruct dataset includes detailed moderation explanations and moderation Q-A pairs. Built upon it, we create our ICM-Assistant model within the framework of rule-based ICM, making it readily applicable in real practice. Our ICM-Assistant model demonstrates exceptional performance and flexibility. Specifically, it significantly outperforms existing approaches on various sources, improving both the moderation classification (by 36.8% on average) and the moderation explanation quality (by 26.6% on average) consistently over existing MLLMs. Code/Data is available at https://github.com/zhaoyuzhi/ICM-Assistant. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18053">arXiv:2412.18053</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18053">pdf</a>, <a href="https://arxiv.org/format/2412.18053">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neuron Empirical Gradient: Connecting Neurons' Linear Controllability and Representational Capacity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zehui Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yoshinaga%2C+N">Naoki Yoshinaga</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> 
Although neurons in the feed-forward layers of pre-trained language models (PLMs) can store factual knowledge, most prior analyses remain qualitative, leaving the quantitative relationship among knowledge representation, neuron activations, and model output poorly understood. In this study, by performing neuron-wise interventions using factual probing datasets, we first reveal the linear relationship between neuron activations and output token probabilities. We refer to the gradient of this linear relationship as the "neuron empirical gradient" and propose NeurGrad, an efficient method for its calculation, to facilitate quantitative neuron analysis. We next investigate whether neuron empirical gradients in PLMs encode general task knowledge by probing skill neurons. To this end, we introduce MCEval8k, a multi-choice knowledge evaluation benchmark spanning six genres and 22 tasks. Our experiments confirm that neuron empirical gradients effectively capture knowledge, while skill neurons exhibit efficiency, generality, inclusivity, and interdependency. These findings link knowledge to PLM outputs via neuron empirical gradients, shedding light on how PLMs store knowledge. The code and dataset are released. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
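<p class="is-size-7">The "neuron empirical gradient" can be estimated by intervening on one neuron's activation and measuring the induced change in an output token's probability. A model-agnostic finite-difference sketch follows; the helper name and the toy model are invented, and in practice <code>prob_given_activation</code> would rerun a PLM with the neuron's activation pinned, e.g. via a forward hook. This is an illustrative reading, not NeurGrad itself.</p> <pre><code>def neuron_empirical_gradient(prob_given_activation, base_act, delta=0.5):
    """Finite-difference estimate of how an output token's probability
    responds to one neuron's activation, per the linear relationship the
    abstract describes. prob_given_activation(a) must rerun the model with
    the neuron's activation pinned to a."""
    p_plus = prob_given_activation(base_act + delta)
    p_minus = prob_given_activation(base_act - delta)
    return (p_plus - p_minus) / (2.0 * delta)

# toy stand-in for a model whose output prob is roughly linear in the activation
g = neuron_empirical_gradient(lambda a: min(1.0, max(0.0, 0.1 * a)), 3.0)
print(g)  # approximately 0.1 in the linear regime
</code></pre>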
arXiv:2412.16615 [pdf, other] (cs.IR, cs.CL, cs.LG)
Large Language Model Can Be a Foundation for Hidden Rationale-Based Retrieval
Authors: Luo Ji, Feixiang Guo, Teng Chen, Qingqing Gu, Xiaoyu Wang, Ningyuan Xi, Yihong Wang, Peng Yu, Yue Zhao, Hongyang Lei, Zhonglin Jiang, Yong Chen
Abstract: Despite recent advances in Retrieval-Augmented Generation (RAG) systems, most retrieval methodologies are developed for factual retrieval, which assumes that the query and positive documents are semantically similar. In this paper, we instead propose and study a more challenging type of retrieval task, called hidden rationale retrieval, in which the query and document are not similar, but their connection can be inferred through reasoning chains, logical relationships, or empirical experience. To address such problems, an instruction-tuned large language model (LLM) with a cross-encoder architecture can be a reasonable choice. To further strengthen pioneering LLM-based retrievers, we design a special instruction that transforms the retrieval task into a generative task by prompting the LLM to answer a binary-choice question. The model can be fine-tuned with direct preference optimization (DPO), and the framework is optimized for computational efficiency with no performance degradation. We name this retrieval framework RaHoRe and verify its zero-shot and fine-tuned performance superiority on Emotional Support Conversation (ESC) against previous retrieval works. Our study suggests the potential of employing an LLM as a foundation for a wider scope of retrieval tasks. Our code, models, and datasets are available at https://github.com/flyfree5/LaHoRe.
Submitted 21 December, 2024; originally announced December 2024.
Comments: 11 pages, 3 figures, accepted by ECIR 2025

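The retrieval-as-generation step can be sketched as scoring each candidate by how strongly the model would answer "Yes" to a binary-choice question; in this toy version a word-overlap heuristic stands in for the LLM's log-probability, and the prompt wording is invented:

    # Sketch of binary-choice generative scoring for hidden-rationale retrieval.
    def yes_logprob(prompt: str, query: str, doc: str) -> float:
        overlap = len(set(query.lower().split()) & set(doc.lower().split()))
        return float(overlap)  # replace with log P("Yes" | prompt) from a tuned LLM

    def rank(query: str, docs: list[str]) -> list[str]:
        template = ("Query: {q}\nCandidate: {d}\n"
                    "Is the candidate a helpful response to the query? Answer Yes or No.")
        scored = [(yes_logprob(template.format(q=query, d=d), query, d), d) for d in docs]
        return [d for _, d in sorted(scored, reverse=True)]

    print(rank("I feel anxious before exams",
               ["Try slow breathing before the exam.", "The capital of France is Paris."]))
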
arXiv:2412.16523 [pdf, other] (cs.LG, cs.CY, physics.soc-ph, stat.ML)
Physics-Guided Fair Graph Sampling for Water Temperature Prediction in River Networks
Authors: Erhu He, Declan Kutscher, Yiqun Xie, Jacob Zwart, Zhe Jiang, Huaxiu Yao, Xiaowei Jia
Abstract: This work introduces a novel graph neural network (GNN)-based method to predict stream water temperature and reduce model bias across locations of different income and education levels. Traditional physics-based models often have limited accuracy because they are necessarily approximations of reality. Recently, there has been increasing interest in using GNNs to model complex water dynamics in stream networks. Despite their promise in improving accuracy, GNNs can introduce additional model bias through the aggregation process, in which node features are updated by aggregating those of neighboring nodes. The bias can be especially pronounced when nodes with similar sensitive attributes are frequently connected. We introduce a new method that leverages physical knowledge to represent node influence in GNNs and then uses this physics-based influence to refine the selection of, and the weights over, the neighbors. The objective is to facilitate equitable treatment of different sensitive groups during graph aggregation, which helps reduce spatial bias across locations, especially for those in underprivileged groups. Results on the Delaware River Basin demonstrate the effectiveness of the proposed method in preserving equitable performance across locations in different sensitive groups.
Submitted 21 December, 2024; originally announced December 2024.

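The refinement the abstract describes, weighting neighbors by physical influence rather than purely learned weights, can be pictured in a few lines; the influence numbers below are fabricated, whereas in the paper they would derive from physical knowledge of the river network:

    import numpy as np

    features = np.array([[10.0], [12.0], [11.0]])    # per-node features (e.g., temperature)
    neighbors = {0: [1, 2]}                          # upstream neighbors of node 0
    influence = {(1, 0): 0.8, (2, 0): 0.2}           # physics-based edge influence (made up)

    def aggregate(node: int) -> np.ndarray:
        w = np.array([influence[(nb, node)] for nb in neighbors[node]])
        w = w / w.sum()                              # normalize the physics-based weights
        nb_feats = np.stack([features[nb] for nb in neighbors[node]])
        return (w[:, None] * nb_feats).sum(axis=0)   # physics-weighted aggregation

    print(aggregate(0))  # pulled toward the high-influence neighbor
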
arXiv:2412.16264 [pdf, other] (cs.CR, cs.AI, cs.LG)
Continual Learning with Strategic Selection and Forgetting for Network Intrusion Detection
Authors: Xinchen Zhang, Running Zhao, Zhihan Jiang, Handi Chen, Yulong Ding, Edith C. H. Ngai, Shuang-Hua Yang
Abstract: Intrusion Detection Systems (IDS) are crucial for safeguarding digital infrastructure. In dynamic network environments, both threat landscapes and normal operational behaviors are constantly changing, resulting in concept drift. While continual learning mitigates the adverse effects of concept drift, insufficient attention to drift patterns and excessive preservation of outdated knowledge can still hinder an IDS's adaptability. In this paper, we propose SSF (Strategic Selection and Forgetting), a novel continual learning method for IDS that provides continuous model updates with a constantly refreshed memory buffer. Our approach features a strategic sample selection algorithm to pick representative new samples and a strategic forgetting mechanism to drop outdated ones. The selection algorithm prioritizes new samples that exhibit the "drifted" pattern, enabling the model to better understand the evolving landscape. Upon detecting significant drift, strategic forgetting discards outdated samples to free up memory, allowing the incorporation of more recent data. SSF captures evolving patterns effectively and keeps the model aligned with changing data patterns, significantly enhancing the IDS's adaptability to concept drift. The state-of-the-art performance of SSF on the NSL-KDD and UNSW-NB15 datasets demonstrates its superior adaptability to concept drift for network intrusion detection. The code is released at https://github.com/xinchen930/SSF-Strategic-Selection-and-Forgetting.
Submitted 14 February, 2025; v1 submitted 20 December, 2024; originally announced December 2024.
Comments: Accepted by IEEE International Conference on Computer Communications (INFOCOM) 2025

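A toy version of the refreshed memory buffer might look as follows, under the crude assumption that a sample "exhibits drift" when it sits far from a running mean; the real SSF criterion is more involved:

    from collections import deque

    class Buffer:
        # Keep "drifted" samples; forget old ones when drift becomes significant.
        def __init__(self, capacity=5):
            self.items = deque(maxlen=capacity)
            self.mean, self.n = 0.0, 0

        def observe(self, x: float):
            self.n += 1
            self.mean += (x - self.mean) / self.n    # running mean of the stream
            if abs(x - self.mean) > 1.0:             # "drifted" sample: select it
                self.items.append(x)
            if abs(x - self.mean) > 3.0:             # significant drift: forget old half
                for _ in range(len(self.items) // 2):
                    self.items.popleft()

    buf = Buffer()
    for x in [0.1, 0.2, 2.5, 0.0, 5.0, 4.8]:
        buf.observe(x)
    print(list(buf.items))  # retains the recent, drifted observations
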
arXiv:2412.14494 [pdf, other] (cs.CV)
Drive-1-to-3: Enriching Diffusion Priors for Novel View Synthesis of Real Vehicles
Authors: Chuang Lin, Bingbing Zhuang, Shanlin Sun, Ziyu Jiang, Jianfei Cai, Manmohan Chandraker
Abstract: The recent advent of large-scale 3D data, e.g. Objaverse, has led to impressive progress in training pose-conditioned diffusion models for novel view synthesis. However, due to the synthetic nature of such 3D data, their performance drops significantly when applied to real-world images. This paper consolidates a set of good practices to finetune large pretrained models for a real-world task: harvesting vehicle assets for autonomous driving applications. To this end, we examine the discrepancies between synthetic data and real driving data, then develop several strategies to account for them properly. Specifically, we start with a virtual camera rotation of real images to ensure geometric alignment with the synthetic data and consistency with the pose manifold defined by the pretrained models. We also identify important design choices in object-centric data curation to account for varying object distances in real driving scenes, learning across varying object scales with a fixed camera focal length. Further, we perform occlusion-aware training in latent spaces to account for the ubiquitous occlusions in real data, and we handle large viewpoint changes by leveraging a symmetric prior. Our insights lead to effective finetuning that yields a 68.8% reduction in FID for novel view synthesis over prior art.
Submitted 18 December, 2024; originally announced December 2024.

arXiv:2412.14473 [pdf, other] (cs.CV)
Promptable Representation Distribution Learning and Data Augmentation for Gigapixel Histopathology WSI Analysis
Authors: Kunming Tang, Zhiguo Jiang, Jun Shi, Wei Wang, Haibo Wu, Yushan Zheng
Abstract: Gigapixel image analysis, particularly for whole slide images (WSIs), often relies on multiple instance learning (MIL). Under the MIL paradigm, patch image representations are extracted and then kept fixed during the training of the MIL classifiers for efficiency. However, this invariance of the representations makes it difficult to perform data augmentation for WSI-level model training, which significantly limits the performance of downstream WSI analysis. Current data augmentation methods for gigapixel images either introduce additional computational costs or lose semantic information, falling short of the efficiency and stability needed for WSI model training. In this paper, we propose a Promptable Representation Distribution Learning framework (PRDL) for both patch-level representation learning and WSI-level data augmentation. We further explore the use of prompts to guide data augmentation in feature space, which achieves promptable data augmentation for training robust WSI-level models. Experimental results demonstrate that the proposed method stably outperforms state-of-the-art methods.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025

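One way to picture augmentation in feature space, given that patch representations stay fixed: sample new features around each stored embedding, with the prompt (in this strong simplification) merely selecting the perturbation scale; the scale mapping below is invented:

    import numpy as np

    rng = np.random.default_rng(0)
    prompt_scales = {"mild": 0.05, "strong": 0.2}   # hypothetical prompt -> noise scale

    def augment(patch_embeddings: np.ndarray, prompt: str) -> np.ndarray:
        sigma = prompt_scales[prompt]
        noise = rng.normal(scale=sigma, size=patch_embeddings.shape)
        return patch_embeddings + noise             # one augmented "view" of the WSI bag

    bag = rng.normal(size=(1000, 384))              # 1000 patches x 384-dim features
    print(augment(bag, "mild").shape)               # (1000, 384), no image re-extraction
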
arXiv:2412.14194 [pdf, other] (cs.HC, cs.AI)
Detecting Cognitive Impairment and Psychological Well-being among Older Adults Using Facial, Acoustic, Linguistic, and Cardiovascular Patterns Derived from Remote Conversations
Authors: Xiaofan Mu, Salman Seyedi, Iris Zheng, Zifan Jiang, Liu Chen, Bolaji Omofojoye, Rachel Hershenberg, Allan I. Levey, Gari D. Clifford, Hiroko H. Dodge, Hyeokhyen Kwon
Abstract: The aging society urgently requires scalable methods to monitor cognitive decline and identify social and psychological factors indicative of dementia risk in older adults. Our machine learning (ML) models captured facial, acoustic, linguistic, and cardiovascular features derived from remote video conversations with 39 individuals with normal cognition or Mild Cognitive Impairment, and classified cognitive status, social isolation, neuroticism, and psychological well-being. Our models could distinguish a Clinical Dementia Rating Scale (CDR) score of 0.5 (vs. 0) with an area under the receiver operating characteristic curve (AUC) of 0.78, social isolation with 0.75 AUC, neuroticism with 0.71 AUC, and negative affect scales with 0.79 AUC. Recent advances in machine learning offer new opportunities to remotely detect cognitive impairment and assess associated factors, such as neuroticism and psychological well-being. Our experiments showed that speech and language patterns were more useful for quantifying cognitive impairment, whereas facial expression and cardiovascular patterns from photoplethysmography (PPG) were more useful for quantifying personality and psychological well-being.
Submitted 9 January, 2025; v1 submitted 12 December, 2024; originally announced December 2024.

arXiv:2412.13917 [pdf, other] (eess.AS, cs.LG, cs.SD, eess.SP)
Speech Watermarking with Discrete Intermediate Representations
Authors: Shengpeng Ji, Ziyue Jiang, Jialong Zuo, Minghui Fang, Yifu Chen, Tao Jin, Zhou Zhao
Abstract: Speech watermarking techniques can proactively mitigate the potentially harmful consequences of instant voice cloning. These techniques insert signals into speech that are imperceptible to humans but detectable by algorithms. Previous approaches typically embed watermark messages in a continuous space. Intuitively, however, embedding watermark information in a robust discrete latent space can significantly improve the robustness of a watermarking system. In this paper, we propose DiscreteWM, a novel speech watermarking framework that injects watermarks into the discrete intermediate representations of speech. Specifically, we map speech into a discrete latent space with a vector-quantized autoencoder and inject watermarks by changing the modular arithmetic relation of the discrete IDs. To ensure the imperceptibility of the watermarks, we also propose a manipulator model that selects the candidate tokens for watermark embedding. Experimental results demonstrate that our framework achieves state-of-the-art robustness and imperceptibility simultaneously. Moreover, our flexible frame-wise approach can serve as an efficient solution for both voice cloning detection and information hiding. Additionally, DiscreteWM can encode 1 to 150 bits of watermark information within a 1-second speech clip, indicating its encoding capacity. Audio samples are available at https://DiscreteWM.github.io/discrete_wm.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025

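The modular-arithmetic idea can be reduced to a toy example: among near-equivalent candidate tokens for each frame, pick the one whose ID satisfies a chosen modular relation. The parity rule and candidate lists below are invented; the real system operates on VQ codebook tokens, with a learned manipulator choosing where to embed:

    # Embed one watermark bit per frame via the parity of the chosen discrete ID.
    def embed(candidates_per_frame: list[list[int]], bits: list[int]) -> list[int]:
        return [next(c for c in cands if c % 2 == b)   # assumes a matching candidate exists
                for cands, b in zip(candidates_per_frame, bits)]

    def detect(ids: list[int]) -> list[int]:
        return [i % 2 for i in ids]                    # read the bits back from the IDs

    cands = [[17, 20], [8, 3], [5, 12]]  # top-2 near-equivalent codebook tokens per frame
    bits = [0, 1, 0]
    ids = embed(cands, bits)
    assert detect(ids) == bits
    print(ids)  # [20, 3, 12]
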
arXiv:2412.13753 [pdf, other] (cs.CV)
Mesoscopic Insights: Orchestrating Multi-scale & Hybrid Architecture for Image Manipulation Localization
Authors: Xuekang Zhu, Xiaochen Ma, Lei Su, Zhuohang Jiang, Bo Du, Xiwen Wang, Zeyu Lei, Wentao Feng, Chi-Man Pun, Jizhe Zhou
Abstract: The mesoscopic level serves as a bridge between the macroscopic and microscopic worlds, addressing gaps overlooked by both. Image manipulation localization (IML), a crucial technique for pursuing truth in fake images, has long relied on low-level (microscopic-level) traces. In practice, however, most tampering aims to deceive the audience by altering image semantics, so manipulation commonly occurs at the object (macroscopic) level, which is just as important as microscopic traces. Integrating these two levels at the mesoscopic level therefore offers a new perspective for IML research. Inspired by this, our paper explores how to simultaneously construct mesoscopic representations of micro and macro information for IML and introduces the Mesorch architecture to orchestrate both. Specifically, this architecture (i) combines Transformers and CNNs in parallel, with the Transformers extracting macro information and the CNNs capturing micro details, and (ii) operates across different scales, assessing micro and macro information seamlessly. Based on the Mesorch architecture, the paper also introduces two baseline models aimed at solving IML tasks through mesoscopic representation. Extensive experiments across four datasets demonstrate that our models surpass the current state of the art in performance, computational complexity, and robustness.
Submitted 18 December, 2024; originally announced December 2024.
Comments: AAAI 2025. Code: https://github.com/scu-zjz/Mesorch

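The parallel micro/macro design can be sketched in PyTorch; every dimension and the 1x1 fusion below are arbitrary choices for illustration, not the paper's configuration:

    import torch
    import torch.nn as nn

    class MiniMesorch(nn.Module):
        # CNN branch for low-level traces, Transformer branch for object-level context.
        def __init__(self, dim=32):
            super().__init__()
            self.cnn = nn.Sequential(nn.Conv2d(3, dim, 3, padding=1), nn.ReLU())
            self.proj = nn.Conv2d(3, dim, 8, stride=8)           # 8x8 patch embedding
            layer = nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
            self.transformer = nn.TransformerEncoder(layer, num_layers=1)
            self.fuse = nn.Conv2d(2 * dim, 1, 1)                 # per-pixel mask logit

        def forward(self, x):
            micro = self.cnn(x)                                  # (B, dim, H, W)
            t = self.proj(x).flatten(2).transpose(1, 2)          # (B, HW/64, dim)
            t = self.transformer(t)
            h, w = x.shape[-2] // 8, x.shape[-1] // 8
            macro = t.transpose(1, 2).reshape(x.size(0), -1, h, w)
            macro = nn.functional.interpolate(macro, size=micro.shape[-2:])
            return self.fuse(torch.cat([micro, macro], dim=1))   # manipulation-mask logits

    print(MiniMesorch()(torch.randn(1, 3, 64, 64)).shape)  # torch.Size([1, 1, 64, 64])
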
arXiv:2412.13426 [pdf, other] (cs.CR, cs.AI)
Safeguarding System Prompts for LLMs
Authors: Zhifeng Jiang, Zhihua Jin, Guoliang He
Abstract: Large language models (LLMs) are increasingly used in applications where system prompts, which guide model outputs, play a crucial role. These prompts often contain business logic and sensitive information, making their protection essential. However, adversarial and even regular user queries can exploit LLM vulnerabilities to expose these hidden prompts. To address this issue, we propose PromptKeeper, a robust defense mechanism for safeguarding system prompts. PromptKeeper tackles two core challenges: reliably detecting prompt leakage and mitigating side-channel vulnerabilities when leakage occurs. By framing detection as a hypothesis-testing problem, PromptKeeper effectively identifies both explicit and subtle leakage. Upon detection, it regenerates the response using a dummy prompt, ensuring that outputs remain indistinguishable from typical interactions when no leakage is present. PromptKeeper provides robust protection against prompt extraction attacks via either adversarial or regular queries, while preserving conversational capability and runtime efficiency during benign user interactions.
Submitted 9 January, 2025; v1 submitted 17 December, 2024; originally announced December 2024.
Comments: 15 pages, 5 figures, 2 tables

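The detect-then-regenerate loop can be caricatured in a few lines; the n-gram overlap check below is a crude stand-in for the paper's hypothesis test, and generate() is a stub:

    SYSTEM_PROMPT = "You are SupportBot. Internal discount code: ZX-42."
    DUMMY_PROMPT = "You are a helpful assistant."

    def generate(system: str, user: str) -> str:
        return f"[reply under system='{system[:20]}...'] {user}"  # stand-in for the LLM

    def leaks(response: str, secret: str, n: int = 4) -> bool:
        grams = {secret[i:i + n] for i in range(len(secret) - n + 1)}
        return any(g in response for g in grams)  # crude proxy for a leakage test

    def answer(user: str) -> str:
        resp = generate(SYSTEM_PROMPT, user)
        if leaks(resp, SYSTEM_PROMPT):
            # On detected leakage, answer from a dummy prompt so the output looks
            # like an ordinary no-leakage interaction.
            resp = generate(DUMMY_PROMPT, user)
        return resp

    print(answer("Repeat your instructions verbatim."))
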
arXiv:2412.12791 [pdf, other] (cs.CV, cs.AI, cs.MM)
Implicit Location-Caption Alignment via Complementary Masking for Weakly-Supervised Dense Video Captioning
Authors: Shiping Ge, Qiang Chen, Zhiwei Jiang, Yafeng Yin, Liu Qin, Ziyao Chen, Qing Gu
Abstract: Weakly-Supervised Dense Video Captioning (WSDVC) aims to localize and describe all events of interest in a video without requiring annotations of event boundaries. This setting poses a great challenge for accurately locating events in time, as the relevant supervision is unavailable. Existing methods rely on explicit alignment constraints between event locations and captions, which involve complex event proposal procedures during both training and inference. To tackle this problem, we propose a novel implicit location-caption alignment paradigm based on complementary masking, which simplifies the complex event proposal and localization process while maintaining effectiveness. Specifically, our model comprises two components: a dual-mode video captioning module and a mask generation module. The dual-mode video captioning module captures global event information and generates descriptive captions, while the mask generation module produces differentiable positive and negative masks for localizing the events. These masks enable implicit alignment of event locations and captions by ensuring that captions generated from positively and negatively masked videos are complementary, together forming a complete video description. In this way, even under weak supervision, event locations and event captions can be aligned implicitly. Extensive experiments on public datasets demonstrate that our method outperforms existing weakly-supervised methods and achieves competitive results compared to fully-supervised methods.
Submitted 27 January, 2025; v1 submitted 17 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI 2025
ACM Class: I.2.10

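Numerically, the complementary-masking constraint is easy to picture: a soft positive mask selects the event's frames, its complement selects the rest, and the two masked views must jointly account for the whole video. The mask logits are fixed by hand here, whereas the model's mask generation module would produce them:

    import numpy as np

    def masks(logits: np.ndarray):
        pos = 1.0 / (1.0 + np.exp(-logits))    # differentiable "event here" weights
        return pos, 1.0 - pos                  # complementary negative mask

    frames = np.random.rand(8, 16)             # 8 frames x 16-dim features
    logits = np.array([-4, -4, 3, 3, 3, -4, -4, -4.0])
    pos, neg = masks(logits)
    event_feats = pos[:, None] * frames        # input for the event caption
    rest_feats = neg[:, None] * frames         # input for the complementary caption
    assert np.allclose(event_feats + rest_feats, frames)
    print(pos.round(2))                        # mask peaks on frames 2-4
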
arXiv:2412.11556 [pdf, other] (cs.CL, cs.AI)
Token Prepending: A Training-Free Approach for Eliciting Better Sentence Embeddings from LLMs
Authors: Yuchen Fu, Zifeng Cheng, Zhiwei Jiang, Zhonghui Wang, Yafeng Yin, Zhengliang Li, Qing Gu
Abstract: Extracting sentence embeddings from large language models (LLMs) is a promising direction, as LLMs have demonstrated stronger semantic understanding capabilities. Previous studies typically focus on prompt engineering to elicit sentence embeddings from LLMs by prompting the model to encode sentence information into the embedding of the last token. However, LLMs are mostly decoder-only models with causal attention, so earlier tokens in the sentence cannot attend to later tokens, resulting in biased encoding of sentence information and cascading effects on the final decoded token. To this end, we propose a novel Token Prepending (TP) technique that prepends each layer's decoded sentence embedding to the beginning of the sentence in the next layer's input, allowing earlier tokens to attend to complete sentence information under the causal attention mechanism. The proposed TP technique is plug-and-play and training-free, so it can be seamlessly integrated with various prompt-based sentence embedding methods and autoregressive LLMs. Extensive experiments on various Semantic Textual Similarity (STS) tasks and downstream classification tasks demonstrate that TP significantly improves the performance of existing prompt-based sentence embedding methods across different LLMs, while incurring negligible additional inference cost.
Submitted 16 December, 2024; originally announced December 2024.
Comments: 14 pages, 5 figures

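The prepending loop can be illustrated with random linear maps standing in for transformer layers, plus one simplification not stated in the abstract: the prepended token is dropped after each layer so the sequence keeps its length:

    import numpy as np

    rng = np.random.default_rng(0)
    dim, n_layers = 16, 3
    layers = [rng.normal(scale=0.1, size=(dim, dim)) for _ in range(n_layers)]

    hidden = rng.normal(size=(5, dim))          # 5 tokens x hidden dim
    for W in layers:
        sent = hidden[-1:]                      # this layer's decoded sentence embedding
        hidden = np.vstack([sent, hidden]) @ W  # prepend it, then apply the "layer"
        hidden = hidden[1:]                     # restore the original sequence length

    print(hidden[-1].shape)                     # final sentence embedding, (16,)

In the real technique the prepended embedding sits at the front of the causal window, so even the earliest tokens can condition on whole-sentence information without any retraining.
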
