Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 527 results for author: <span class="mathjax">Feng, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Feng%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Feng, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Feng%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Feng, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13582">arXiv:2411.13582</a> <span> [<a href="https://arxiv.org/pdf/2411.13582">pdf</a>, <a href="https://arxiv.org/format/2411.13582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.neucom.2024.128848">10.1016/j.neucom.2024.128848 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Deep Feature Response Discriminative Calibration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenxiang Xu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+T">Tian Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Linyun Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Mingli Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huiqiong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13582v1-abstract-short" style="display: inline;"> Deep neural networks (DNNs) have numerous applications across various domains. Several optimization techniques, such as ResNet and SENet, have been proposed to improve model accuracy. These techniques improve the model performance by adjusting or calibrating feature responses according to a uniform standard. However, they lack the discriminative calibration for different features, thereby introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13582v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13582v1-abstract-full" style="display: none;"> Deep neural networks (DNNs) have numerous applications across various domains. Several optimization techniques, such as ResNet and SENet, have been proposed to improve model accuracy. These techniques improve the model performance by adjusting or calibrating feature responses according to a uniform standard. However, they lack the discriminative calibration for different features, thereby introducing limitations in the model output. Therefore, we propose a method that discriminatively calibrates feature responses. The preliminary experimental results indicate that the neural feature response follows a Gaussian distribution. Consequently, we compute confidence values by employing the Gaussian probability density function, and then integrate these values with the original response values. The objective of this integration is to improve the feature discriminability of the neural feature response. Based on the calibration values, we propose a plugin-based calibration module incorporated into a modified ResNet architecture, termed Response Calibration Networks (ResCNet). Extensive experiments on datasets like CIFAR-10, CIFAR-100, SVHN, and ImageNet demonstrate the effectiveness of the proposed approach. The developed code is publicly available at https://github.com/tcmyxc/ResCNet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13582v1-abstract-full').style.display = 'none'; document.getElementById('2411.13582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Neurocomputing 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13025">arXiv:2411.13025</a> <span> [<a href="https://arxiv.org/pdf/2411.13025">pdf</a>, <a href="https://arxiv.org/format/2411.13025">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ORID: Organ-Regional Information Driven Framework for Radiology Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+T">Tiancheng Gu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaicheng Yang</a>, <a href="/search/cs?searchtype=author&query=An%2C+X">Xiang An</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziyong Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongnan Liu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+W">Weidong Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13025v1-abstract-short" style="display: inline;"> The objective of Radiology Report Generation (RRG) is to automatically generate coherent textual analyses of diseases based on radiological images, thereby alleviating the workload of radiologists. Current AI-based methods for RRG primarily focus on modifications to the encoder-decoder model architecture. To advance these approaches, this paper introduces an Organ-Regional Information Driven (ORID… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13025v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13025v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13025v1-abstract-full" style="display: none;"> The objective of Radiology Report Generation (RRG) is to automatically generate coherent textual analyses of diseases based on radiological images, thereby alleviating the workload of radiologists. Current AI-based methods for RRG primarily focus on modifications to the encoder-decoder model architecture. To advance these approaches, this paper introduces an Organ-Regional Information Driven (ORID) framework which can effectively integrate multi-modal information and reduce the influence of noise from unrelated organs. Specifically, based on the LLaVA-Med, we first construct an RRG-related instruction dataset to improve organ-regional diagnosis description ability and get the LLaVA-Med-RRG. After that, we propose an organ-based cross-modal fusion module to effectively combine the information from the organ-regional diagnosis description and radiology image. To further reduce the influence of noise from unrelated organs on the radiology report generation, we introduce an organ importance coefficient analysis module, which leverages Graph Neural Network (GNN) to examine the interconnections of the cross-modal information of each organ region. Extensive experiments an1d comparisons with state-of-the-art methods across various evaluation metrics demonstrate the superior performance of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13025v1-abstract-full').style.display = 'none'; document.getElementById('2411.13025v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 11 figures, WACV2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11770">arXiv:2411.11770</a> <span> [<a href="https://arxiv.org/pdf/2411.11770">pdf</a>, <a href="https://arxiv.org/format/2411.11770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CNMBert: A Model For Hanyu Pinyin Abbreviation to Character Conversion Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zishuo Feng</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+F">Feng Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11770v1-abstract-short" style="display: inline;"> The task of converting Hanyu Pinyin abbreviations to Chinese characters represents a significant branch within the domain of Chinese Spelling Correction (CSC). This task is typically one of text-length alignment, however, due to the limited informational content in pinyin abbreviations, achieving accurate conversion is challenging. In this paper, we propose CNMBert which stands for zh-CN Pinyin Mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11770v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11770v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11770v1-abstract-full" style="display: none;"> The task of converting Hanyu Pinyin abbreviations to Chinese characters represents a significant branch within the domain of Chinese Spelling Correction (CSC). This task is typically one of text-length alignment, however, due to the limited informational content in pinyin abbreviations, achieving accurate conversion is challenging. In this paper, we propose CNMBert which stands for zh-CN Pinyin Multi-mask Bert Model as a solution to this issue. CNMBert surpasses few-shot GPT models, achieving a 59.63% MRR on a 10,424-sample Hanyu Pinyin abbreviation test dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11770v1-abstract-full').style.display = 'none'; document.getElementById('2411.11770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09429">arXiv:2411.09429</a> <span> [<a href="https://arxiv.org/pdf/2411.09429">pdf</a>, <a href="https://arxiv.org/format/2411.09429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Superconductivity">cond-mat.supr-con</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AI-driven inverse design of materials: Past, present and future </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiao-Qi Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin-De Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Meng-Yuan Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhen Feng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bo-Wen Yao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peng-Jie Guo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ze-Feng Gao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhong-Yi Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09429v1-abstract-short" style="display: inline;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09429v1-abstract-full" style="display: none;"> The discovery of advanced materials is the cornerstone of human technological development and progress. The structures of materials and their corresponding properties are essentially the result of a complex interplay of multiple degrees of freedom such as lattice, charge, spin, symmetry, and topology. This poses significant challenges for the inverse design methods of materials. Humans have long explored new materials through a large number of experiments and proposed corresponding theoretical systems to predict new material properties and structures. With the improvement of computational power, researchers have gradually developed various electronic structure calculation methods, particularly such as the one based density functional theory, as well as high-throughput computational methods. Recently, the rapid development of artificial intelligence technology in the field of computer science has enabled the effective characterization of the implicit association between material properties and structures, thus opening up an efficient paradigm for the inverse design of functional materials. A significant progress has been made in inverse design of materials based on generative and discriminative models, attracting widespread attention from researchers. Considering this rapid technological progress, in this survey, we look back on the latest advancements in AI-driven inverse design of materials by introducing the background, key findings, and mainstream technological development routes. In addition, we summarize the remaining issues for future directions. This survey provides the latest overview of AI-driven inverse design of materials, which can serve as a useful resource for researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09429v1-abstract-full').style.display = 'none'; document.getElementById('2411.09429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">43 pages, 5 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04554">arXiv:2411.04554</a> <span> [<a href="https://arxiv.org/pdf/2411.04554">pdf</a>, <a href="https://arxiv.org/format/2411.04554">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Peri-midFormer: Periodic Pyramid Transformer for Time Series Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qiang Wu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+G">Gechang Yao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhixi Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04554v1-abstract-short" style="display: inline;"> Time series analysis finds wide applications in fields such as weather forecasting, anomaly detection, and behavior recognition. Previous methods attempted to model temporal variations directly using 1D time series. However, this has been quite challenging due to the discrete nature of data points in time series and the complexity of periodic variation. In terms of periodicity, taking weather and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04554v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04554v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04554v1-abstract-full" style="display: none;"> Time series analysis finds wide applications in fields such as weather forecasting, anomaly detection, and behavior recognition. Previous methods attempted to model temporal variations directly using 1D time series. However, this has been quite challenging due to the discrete nature of data points in time series and the complexity of periodic variation. In terms of periodicity, taking weather and traffic data as an example, there are multi-periodic variations such as yearly, monthly, weekly, and daily, etc. In order to break through the limitations of the previous methods, we decouple the implied complex periodic variations into inclusion and overlap relationships among different level periodic components based on the observation of the multi-periodicity therein and its inclusion relationships. This explicitly represents the naturally occurring pyramid-like properties in time series, where the top level is the original time series and lower levels consist of periodic components with gradually shorter periods, which we call the periodic pyramid. To further extract complex temporal variations, we introduce self-attention mechanism into the periodic pyramid, capturing complex periodic relationships by computing attention between periodic components based on their inclusion, overlap, and adjacency relationships. Our proposed Peri-midFormer demonstrates outstanding performance in five mainstream time series analysis tasks, including short- and long-term forecasting, imputation, classification, and anomaly detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04554v1-abstract-full').style.display = 'none'; document.getElementById('2411.04554v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04491">arXiv:2411.04491</a> <span> [<a href="https://arxiv.org/pdf/2411.04491">pdf</a>, <a href="https://arxiv.org/format/2411.04491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Series-to-Series Diffusion Bridge Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hao Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhanbo Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R+C">Robert C Qiu</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zenan Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04491v1-abstract-short" style="display: inline;"> Diffusion models have risen to prominence in time series forecasting, showcasing their robust capability to model complex data distributions. However, their effectiveness in deterministic predictions is often constrained by instability arising from their inherent stochasticity. In this paper, we revisit time series diffusion models and present a comprehensive framework that encompasses most existi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04491v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04491v1-abstract-full" style="display: none;"> Diffusion models have risen to prominence in time series forecasting, showcasing their robust capability to model complex data distributions. However, their effectiveness in deterministic predictions is often constrained by instability arising from their inherent stochasticity. In this paper, we revisit time series diffusion models and present a comprehensive framework that encompasses most existing diffusion-based methods. Building on this theoretical foundation, we propose a novel diffusion-based time series forecasting model, the Series-to-Series Diffusion Bridge Model ($\mathrm{S^2DBM}$), which leverages the Brownian Bridge process to reduce randomness in reverse estimations and improves accuracy by incorporating informative priors and conditions derived from historical time series data. Experimental results demonstrate that $\mathrm{S^2DBM}$ delivers superior performance in point-to-point forecasting and competes effectively with other diffusion-based models in probabilistic forecasting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04491v1-abstract-full').style.display = 'none'; document.getElementById('2411.04491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04399">arXiv:2411.04399</a> <span> [<a href="https://arxiv.org/pdf/2411.04399">pdf</a>, <a href="https://arxiv.org/format/2411.04399">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProGraph: Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zehui Feng</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Genfan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Feng Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04399v1-abstract-short" style="display: inline;"> Current 3D human motion reconstruction methods from monocular videos rely on features within the current reconstruction window, leading to distortion and deformations in the human structure under local occlusions or blurriness in video frames. To estimate realistic 3D human mesh sequences based on incomplete features, we propose Temporally-alignable Probability Guided Graph Topological Modeling fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04399v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04399v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04399v1-abstract-full" style="display: none;"> Current 3D human motion reconstruction methods from monocular videos rely on features within the current reconstruction window, leading to distortion and deformations in the human structure under local occlusions or blurriness in video frames. To estimate realistic 3D human mesh sequences based on incomplete features, we propose Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction (ProGraph). For missing parts recovery, we exploit the explicit topological-aware probability distribution across the entire motion sequence. To restore the complete human, Graph Topological Modeling (GTM) learns the underlying topological structure, focusing on the relationships inherent in the individual parts. Next, to generate blurred motion parts, Temporal-alignable Probability Distribution (TPDist) utilizes the GTM to predict features based on distribution. This interactive mechanism facilitates motion consistency, allowing the restoration of human parts. Furthermore, Hierarchical Human Loss (HHLoss) constrains the probability distribution errors of inter-frame features during topological structure variation. Our Method achieves superior results than other SOTA methods in addressing occlusions and blurriness on 3DPW. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04399v1-abstract-full').style.display = 'none'; document.getElementById('2411.04399v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03274">arXiv:2411.03274</a> <span> [<a href="https://arxiv.org/pdf/2411.03274">pdf</a>, <a href="https://arxiv.org/format/2411.03274">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> Generalized Word-Representable Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhidan Feng</a>, <a href="/search/cs?searchtype=author&query=Fernau%2C+H">Henning Fernau</a>, <a href="/search/cs?searchtype=author&query=Fleischmann%2C+P">Pamela Fleischmann</a>, <a href="/search/cs?searchtype=author&query=Mann%2C+K">Kevin Mann</a>, <a href="/search/cs?searchtype=author&query=Sacher%2C+S+C">Silas Cato Sacher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03274v1-abstract-short" style="display: inline;"> The literature on word-representable graphs is quite rich, and a number of variations of the original definition have been proposed over the years. We are initiating a systematic study of such variations based on formal languages. In our framework, we can associate a graph class to each language over the binary alphabet \{0,1\}. All graph classes that are language-representable in this sense are h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03274v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03274v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03274v1-abstract-full" style="display: none;"> The literature on word-representable graphs is quite rich, and a number of variations of the original definition have been proposed over the years. We are initiating a systematic study of such variations based on formal languages. In our framework, we can associate a graph class to each language over the binary alphabet \{0,1\}. All graph classes that are language-representable in this sense are hereditary and enjoy further common properties. Besides word-representable graphs and, more generally, 1^k- or k-11-representable graphs, we can identify many more graph classes in our framework, like (co)bipartite graphs, (co)comparability graphs, to name a few. It was already known that any graph is 111- or 2-11-representable. When such representations are considered for storing graphs, 111- or 2-11-representability bears the disadvantage of being significantly inferior to standard adjacency matrices or lists. We prove that quite famous languages like the palindromes, the copy language or the Lyndon words can match the efficiency of standard graph representations. The perspective of language theory allows us to prove general results that hold for all graph classes that can be defined in this way. This includes certain closure properties (e.g., all language-definable graph classes are hereditary) as well as certain limitations (e.g., all language-representable graph classes contain graphs of arbitrarily large treewidth and of arbitrarily large degeneracy, except a trivial case). As each language describes a graph class, we can also ask decidability questions concerning graph classes, given a concrete presentation of a formal language. We also present a systematic study of graph classes that can be represented by languages in which each letter occurs at most twice. Here, we find graph classes like interval, permutation, circle, bipartite chain, convex, and threshold graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03274v1-abstract-full').style.display = 'none'; document.getElementById('2411.03274v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Q45 05C99 68P30 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01601">arXiv:2411.01601</a> <span> [<a href="https://arxiv.org/pdf/2411.01601">pdf</a>, <a href="https://arxiv.org/format/2411.01601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Investigating the Impact of Interpersonal Challenges on Feeling Welcome in OSS </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Trinkenreich%2C+B">Bianca Trinkenreich</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zixuan Feng</a>, <a href="/search/cs?searchtype=author&query=Choudhuri%2C+R">Rudrajit Choudhuri</a>, <a href="/search/cs?searchtype=author&query=Gerosa%2C+M">Marco Gerosa</a>, <a href="/search/cs?searchtype=author&query=Sarma%2C+A">Anita Sarma</a>, <a href="/search/cs?searchtype=author&query=Steinmacher%2C+I">Igor Steinmacher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01601v1-abstract-short" style="display: inline;"> The sustainability of open source software (OSS) projects hinges on contributor retention. Interpersonal challenges can inhibit a feeling of welcomeness among contributors, particularly from underrepresented groups, which impacts their decision to continue with the project. How much this impact is, varies among individuals, underlining the importance of a thorough understanding of their effects. H… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01601v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01601v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01601v1-abstract-full" style="display: none;"> The sustainability of open source software (OSS) projects hinges on contributor retention. Interpersonal challenges can inhibit a feeling of welcomeness among contributors, particularly from underrepresented groups, which impacts their decision to continue with the project. How much this impact is, varies among individuals, underlining the importance of a thorough understanding of their effects. Here, we investigate the effects of interpersonal challenges on the sense of welcomeness among diverse populations within OSS, through the diversity lenses of gender, race, and (dis)ability. We analyzed the large-scale Linux Foundation Diversity and Inclusion survey (n = 706) to model a theoretical framework linking interpersonal challenges with the sense of welcomeness through Structural Equation Models Partial Least Squares (PLS-SEM). We then examine the model to identify the impact of these challenges on different demographics through Multi-Group Analysis (MGA). Finally, we conducted a regression analysis to investigate how differently people from different demographics experience different types of interpersonal challenges. Our findings confirm the negative association between interpersonal challenges and the feeling of welcomeness in OSS, with this relationship being more pronounced among gender minorities and people with disabilities. We found that different challenges have unique impacts on how people feel welcomed, with variations across gender, race, and disability groups. We also provide evidence that people from gender minorities and with disabilities are more likely to experience interpersonal challenges than their counterparts, especially when we analyze stalking, sexual harassment, and doxxing. Our insights benefit OSS communities, informing potential strategies to improve the landscape of interpersonal relationships, ultimately fostering more inclusive and welcoming communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01601v1-abstract-full').style.display = 'none'; document.getElementById('2411.01601v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00801">arXiv:2411.00801</a> <span> [<a href="https://arxiv.org/pdf/2411.00801">pdf</a>, <a href="https://arxiv.org/format/2411.00801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Heterogeneous Network-based Contrastive Learning Approach for Predicting Drug-Target Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junwei Hu</a>, <a href="/search/cs?searchtype=author&query=Bewong%2C+M">Michael Bewong</a>, <a href="/search/cs?searchtype=author&query=Kwashie%2C+S">Selasi Kwashie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&query=Nofong%2C+V+M">Vincent M. Nofong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guangsheng Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zaiwen Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00801v1-abstract-short" style="display: inline;"> Drug-target interaction (DTI) prediction is crucial for drug development and repositioning. Methods using heterogeneous graph neural networks (HGNNs) for DTI prediction have become a promising approach, with attention-based models often achieving excellent performance. However, these methods typically overlook edge features when dealing with heterogeneous biomedical networks. We propose a heteroge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00801v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00801v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00801v1-abstract-full" style="display: none;"> Drug-target interaction (DTI) prediction is crucial for drug development and repositioning. Methods using heterogeneous graph neural networks (HGNNs) for DTI prediction have become a promising approach, with attention-based models often achieving excellent performance. However, these methods typically overlook edge features when dealing with heterogeneous biomedical networks. We propose a heterogeneous network-based contrastive learning method called HNCL-DTI, which designs a heterogeneous graph attention network to predict potential/novel DTIs. Specifically, our HNCL-DTI utilizes contrastive learning to collaboratively learn node representations from the perspective of both node-based and edge-based attention within the heterogeneous structure of biomedical networks. Experimental results show that HNCL-DTI outperforms existing advanced baseline methods on benchmark datasets, demonstrating strong predictive ability and practical effectiveness. The data and source code are available at https://github.com/Zaiwen/HNCL-DTI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00801v1-abstract-full').style.display = 'none'; document.getElementById('2411.00801v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00593">arXiv:2411.00593</a> <span> [<a href="https://arxiv.org/pdf/2411.00593">pdf</a>, <a href="https://arxiv.org/format/2411.00593">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adapting Language Models via Token Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhili Feng</a>, <a href="/search/cs?searchtype=author&query=Marwah%2C+T">Tanya Marwah</a>, <a href="/search/cs?searchtype=author&query=Fusi%2C+N">Nicolo Fusi</a>, <a href="/search/cs?searchtype=author&query=Alvarez-Melis%2C+D">David Alvarez-Melis</a>, <a href="/search/cs?searchtype=author&query=Mackey%2C+L">Lester Mackey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00593v2-abstract-short" style="display: inline;"> Modern large language models use a fixed tokenizer to effectively compress text drawn from a source domain. However, applying the same tokenizer to a new target domain often leads to inferior compression, more costly inference, and reduced semantic alignment. To address this deficiency, we introduce Sparse Sinkhorn Token Translation (S2T2). S2T2 trains a tailored tokenizer for the target domain an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00593v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00593v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00593v2-abstract-full" style="display: none;"> Modern large language models use a fixed tokenizer to effectively compress text drawn from a source domain. However, applying the same tokenizer to a new target domain often leads to inferior compression, more costly inference, and reduced semantic alignment. To address this deficiency, we introduce Sparse Sinkhorn Token Translation (S2T2). S2T2 trains a tailored tokenizer for the target domain and learns to translate between target and source tokens, enabling more effective reuse of the pre-trained next-source-token predictor. In our experiments with finetuned English language models, S2T2 improves both the perplexity and the compression of out-of-domain protein sequences, outperforming direct finetuning with either the source or target tokenizer. In addition, we find that token translations learned for smaller, less expensive models can be directly transferred to larger, more powerful models to reap the benefits of S2T2 at lower cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00593v2-abstract-full').style.display = 'none'; document.getElementById('2411.00593v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22832">arXiv:2410.22832</a> <span> [<a href="https://arxiv.org/pdf/2410.22832">pdf</a>, <a href="https://arxiv.org/format/2410.22832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yucheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinfeng Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinkui Zhao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhengwen Feng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22832v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by integrating external knowledge, making them adaptable and cost-effective for various applications. However, the growing reliance on these systems also introduces potential security risks. In this work, we reveal a novel vulnerability, the retrieval prompt hijack attack (HijackRAG), which enables attackers to manip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22832v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22832v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by integrating external knowledge, making them adaptable and cost-effective for various applications. However, the growing reliance on these systems also introduces potential security risks. In this work, we reveal a novel vulnerability, the retrieval prompt hijack attack (HijackRAG), which enables attackers to manipulate the retrieval mechanisms of RAG systems by injecting malicious texts into the knowledge database. When the RAG system encounters target questions, it generates the attacker's pre-determined answers instead of the correct ones, undermining the integrity and trustworthiness of the system. We formalize HijackRAG as an optimization problem and propose both black-box and white-box attack strategies tailored to different levels of the attacker's knowledge. Extensive experiments on multiple benchmark datasets show that HijackRAG consistently achieves high attack success rates, outperforming existing baseline attacks. Furthermore, we demonstrate that the attack is transferable across different retriever models, underscoring the widespread risk it poses to RAG systems. Lastly, our exploration of various defense mechanisms reveals that they are insufficient to counter HijackRAG, emphasizing the urgent need for more robust security measures to protect RAG systems in real-world deployments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22832v1-abstract-full').style.display = 'none'; document.getElementById('2410.22832v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22818">arXiv:2410.22818</a> <span> [<a href="https://arxiv.org/pdf/2410.22818">pdf</a>, <a href="https://arxiv.org/format/2410.22818">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A test-free semantic mistakes localization framework in Neural Code Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sai Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fangzhou Xu</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Z">Zhenchang Xing</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+L">Liang Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaowang Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhiyong Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22818v1-abstract-short" style="display: inline;"> In the task of code translation, neural network-based models have been shown to frequently produce semantically erroneous code that deviates from the original logic of the source code. This issue persists even with advanced large models. Although a recent approach proposed using test cases to identify these semantic errors, it relies heavily on the quality of the test cases and is not applicable t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22818v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22818v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22818v1-abstract-full" style="display: none;"> In the task of code translation, neural network-based models have been shown to frequently produce semantically erroneous code that deviates from the original logic of the source code. This issue persists even with advanced large models. Although a recent approach proposed using test cases to identify these semantic errors, it relies heavily on the quality of the test cases and is not applicable to code snippets without test cases in real-world scenarios. Therefore, We present EISP, a static analysis framework based on the Large Language Model (LLM).First, the framework generates a semantic mapping between source code and translated code. Next, each sub-code fragment is identified by recursively traversing the abstract syntax tree of the source code, and its corresponding translated code fragment is found through the semantic mapping. Finally, EISP connects each pair of sub-code fragments with fine-grained knowledge hints through an AI chain to assist LLMs in discovering semantic mistakes in the translated code. In our benchmark evaluation, the EISP framework, based on GPT-4o mini, achieved an accuracy of 82.3\%, representing a 20.3\% improvement over baseline methods using the same base model, and a 7.4\% improvement compared to dynamic analysis methods that require test cases and manual intervention. To our knowledge, EISP is the first tool to locate semantic errors in translated code without test cases or compilable code. This innovative tool provides the software engineering community with a new way to deal with code fragments without test cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22818v1-abstract-full').style.display = 'none'; document.getElementById('2410.22818v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18200">arXiv:2410.18200</a> <span> [<a href="https://arxiv.org/pdf/2410.18200">pdf</a>, <a href="https://arxiv.org/format/2410.18200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Positive Pairs in Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiantao Wu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+S">Shentong Mo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenhua Feng</a>, <a href="/search/cs?searchtype=author&query=Atito%2C+S">Sara Atito</a>, <a href="/search/cs?searchtype=author&query=Kitler%2C+J">Josef Kitler</a>, <a href="/search/cs?searchtype=author&query=Awais%2C+M">Muhammad Awais</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18200v1-abstract-short" style="display: inline;"> Contrastive learning, a prominent approach to representation learning, traditionally assumes positive pairs are closely related samples (the same image or class) and negative pairs are distinct samples. We challenge this assumption by proposing to learn from arbitrary pairs, allowing any pair of samples to be positive within our framework.The primary challenge of the proposed approach lies in appl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18200v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18200v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18200v1-abstract-full" style="display: none;"> Contrastive learning, a prominent approach to representation learning, traditionally assumes positive pairs are closely related samples (the same image or class) and negative pairs are distinct samples. We challenge this assumption by proposing to learn from arbitrary pairs, allowing any pair of samples to be positive within our framework.The primary challenge of the proposed approach lies in applying contrastive learning to disparate pairs which are semantically distant. Motivated by the discovery that SimCLR can separate given arbitrary pairs (e.g., garter snake and table lamp) in a subspace, we propose a feature filter in the condition of class pairs that creates the requisite subspaces by gate vectors selectively activating or deactivating dimensions. This filter can be optimized through gradient descent within a conventional contrastive learning mechanism. We present Hydra, a universal contrastive learning framework for visual representations that extends conventional contrastive learning to accommodate arbitrary pairs. Our approach is validated using IN1K, where 1K diverse classes compose 500,500 pairs, most of them being distinct. Surprisingly, Hydra achieves superior performance in this challenging setting. Additional benefits include the prevention of dimensional collapse and the discovery of class relationships. Our work highlights the value of learning common features of arbitrary pairs and potentially broadens the applicability of contrastive learning techniques on the sample pairs with weak relationships. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18200v1-abstract-full').style.display = 'none'; document.getElementById('2410.18200v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16642">arXiv:2410.16642</a> <span> [<a href="https://arxiv.org/pdf/2410.16642">pdf</a>, <a href="https://arxiv.org/format/2410.16642">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fire and Smoke Detection with Burning Intensity Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyi Han</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanfei Wu</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+N">Nan Pu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Bei%2C+Y">Yijun Bei</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lechao Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16642v1-abstract-short" style="display: inline;"> An effective Fire and Smoke Detection (FSD) and analysis system is of paramount importance due to the destructive potential of fire disasters. However, many existing FSD methods directly employ generic object detection techniques without considering the transparency of fire and smoke, which leads to imprecise localization and reduces detection performance. To address this issue, a new Attentive Fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16642v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16642v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16642v1-abstract-full" style="display: none;"> An effective Fire and Smoke Detection (FSD) and analysis system is of paramount importance due to the destructive potential of fire disasters. However, many existing FSD methods directly employ generic object detection techniques without considering the transparency of fire and smoke, which leads to imprecise localization and reduces detection performance. To address this issue, a new Attentive Fire and Smoke Detection Model (a-FSDM) is proposed. This model not only retains the robust feature extraction and fusion capabilities of conventional detection algorithms but also redesigns the detection head specifically for transparent targets in FSD, termed the Attentive Transparency Detection Head (ATDH). In addition, Burning Intensity (BI) is introduced as a pivotal feature for fire-related downstream risk assessments in traditional FSD methodologies. Extensive experiments on multiple FSD datasets showcase the effectiveness and versatility of the proposed FSD model. The project is available at \href{https://xiaoyihan6.github.io/FSD/}{https://xiaoyihan6.github.io/FSD/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16642v1-abstract-full').style.display = 'none'; document.getElementById('2410.16642v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16631">arXiv:2410.16631</a> <span> [<a href="https://arxiv.org/pdf/2410.16631">pdf</a>, <a href="https://arxiv.org/format/2410.16631">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Multi-Scene Fire and Smoke Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaoyi Han</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+N">Nan Pu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zunlei Feng</a>, <a href="/search/cs?searchtype=author&query=Bei%2C+Y">Yijun Bei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lechao Cheng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Liang Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16631v1-abstract-short" style="display: inline;"> The current irregularities in existing public Fire and Smoke Detection (FSD) datasets have become a bottleneck in the advancement of FSD technology. Upon in-depth analysis, we identify the core issue as the lack of standardized dataset construction, uniform evaluation systems, and clear performance benchmarks. To address this issue and drive innovation in FSD technology, we systematically gather d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16631v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16631v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16631v1-abstract-full" style="display: none;"> The current irregularities in existing public Fire and Smoke Detection (FSD) datasets have become a bottleneck in the advancement of FSD technology. Upon in-depth analysis, we identify the core issue as the lack of standardized dataset construction, uniform evaluation systems, and clear performance benchmarks. To address this issue and drive innovation in FSD technology, we systematically gather diverse resources from public sources to create a more comprehensive and refined FSD benchmark. Additionally, recognizing the inadequate coverage of existing dataset scenes, we strategically expand scenes, relabel, and standardize existing public FSD datasets to ensure accuracy and consistency. We aim to establish a standardized, realistic, unified, and efficient FSD research platform that mirrors real-life scenes closely. Through our efforts, we aim to provide robust support for the breakthrough and development of FSD technology. The project is available at \href{https://xiaoyihan6.github.io/FSD/}{https://xiaoyihan6.github.io/FSD/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16631v1-abstract-full').style.display = 'none'; document.getElementById('2410.16631v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16080">arXiv:2410.16080</a> <span> [<a href="https://arxiv.org/pdf/2410.16080">pdf</a>, <a href="https://arxiv.org/format/2410.16080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of Multi-Channel Fusion in Retrieval for Personalized Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+J">Jiarui Qin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziming Feng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16080v1-abstract-short" style="display: inline;"> Recommender systems (RS) are pivotal in managing information overload in modern digital services. A key challenge in RS is efficiently processing vast item pools to deliver highly personalized recommendations under strict latency constraints. Multi-stage cascade ranking addresses this by employing computationally efficient retrieval methods to cover diverse user interests, followed by more precise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16080v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16080v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16080v1-abstract-full" style="display: none;"> Recommender systems (RS) are pivotal in managing information overload in modern digital services. A key challenge in RS is efficiently processing vast item pools to deliver highly personalized recommendations under strict latency constraints. Multi-stage cascade ranking addresses this by employing computationally efficient retrieval methods to cover diverse user interests, followed by more precise ranking models to refine the results. In the retrieval stage, multi-channel retrieval is often used to generate distinct item subsets from different candidate generators, leveraging the complementary strengths of these methods to maximize coverage. However, forwarding all retrieved items overwhelms downstream rankers, necessitating truncation. Despite advancements in individual retrieval methods, multi-channel fusion, the process of efficiently merging multi-channel retrieval results, remains underexplored. We are the first to identify and systematically investigate multi-channel fusion in the retrieval stage. Current industry practices often rely on heuristic approaches and manual designs, which often lead to suboptimal performance. Moreover, traditional gradient-based methods like SGD are unsuitable for this task due to the non-differentiable nature of the selection process. In this paper, we explore advanced channel fusion strategies by assigning systematically optimized weights to each channel. We utilize black-box optimization techniques, including the Cross Entropy Method and Bayesian Optimization for global weight optimization, alongside policy gradient-based approaches for personalized merging. Our methods enhance both personalization and flexibility, achieving significant performance improvements across multiple datasets and yielding substantial gains in real-world deployments, offering a scalable solution for optimizing multi-channel fusion in retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16080v1-abstract-full').style.display = 'none'; document.getElementById('2410.16080v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14332">arXiv:2410.14332</a> <span> [<a href="https://arxiv.org/pdf/2410.14332">pdf</a>, <a href="https://arxiv.org/format/2410.14332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Croc: Pretraining Large Multimodal Models with Cross-Modal Comprehension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yin Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaicheng Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+N">Ninghua Yang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+W">Weimo Deng</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiangzi Dai</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+T">Tiancheng Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yumeng Wang</a>, <a href="/search/cs?searchtype=author&query=An%2C+X">Xiang An</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yongle Zhao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziyong Feng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiankang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14332v1-abstract-short" style="display: inline;"> Recent advances in Large Language Models (LLMs) have catalyzed the development of Large Multimodal Models (LMMs). However, existing research primarily focuses on tuning language and image instructions, ignoring the critical pretraining phase where models learn to process textual and visual modalities jointly. In this paper, we propose a new pretraining paradigm for LMMs to enhance the visual compr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14332v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14332v1-abstract-full" style="display: none;"> Recent advances in Large Language Models (LLMs) have catalyzed the development of Large Multimodal Models (LMMs). However, existing research primarily focuses on tuning language and image instructions, ignoring the critical pretraining phase where models learn to process textual and visual modalities jointly. In this paper, we propose a new pretraining paradigm for LMMs to enhance the visual comprehension capabilities of LLMs by introducing a novel cross-modal comprehension stage. Specifically, we design a dynamically learnable prompt token pool and employ the Hungarian algorithm to replace part of the original visual tokens with the most relevant prompt tokens. Then, we conceptualize visual tokens as analogous to a "foreign language" for the LLMs and propose a mixed attention mechanism with bidirectional visual attention and unidirectional textual attention to comprehensively enhance the understanding of visual tokens. Meanwhile, we integrate a detailed caption generation task, leveraging rich descriptions to further facilitate LLMs in understanding visual semantic information. After pretraining on 1.5 million publicly accessible data, we present a new foundation model called Croc. Experimental results demonstrate that Croc achieves new state-of-the-art performance on massive vision-language benchmarks. To support reproducibility and facilitate further research, we release the training code and pre-trained model weights at https://github.com/deepglint/Croc. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14332v1-abstract-full').style.display = 'none'; document.getElementById('2410.14332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11820">arXiv:2410.11820</a> <span> [<a href="https://arxiv.org/pdf/2410.11820">pdf</a>, <a href="https://arxiv.org/format/2410.11820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Data Optimization: Dynamic Sample Selection with Scaling Laws </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yiding Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Allan Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhili Feng</a>, <a href="/search/cs?searchtype=author&query=Malladi%2C+S">Sadhika Malladi</a>, <a href="/search/cs?searchtype=author&query=Kolter%2C+J+Z">J. Zico Kolter</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11820v1-abstract-short" style="display: inline;"> The composition of pretraining data is a key determinant of foundation models' performance, but there is no standard guideline for allocating a limited computational budget across different data sources. Most current approaches either rely on extensive experiments with smaller models or dynamic data adjustments that also require proxy models, both of which significantly increase the workflow compl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11820v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11820v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11820v1-abstract-full" style="display: none;"> The composition of pretraining data is a key determinant of foundation models' performance, but there is no standard guideline for allocating a limited computational budget across different data sources. Most current approaches either rely on extensive experiments with smaller models or dynamic data adjustments that also require proxy models, both of which significantly increase the workflow complexity and computational overhead. In this paper, we introduce Adaptive Data Optimization (ADO), an algorithm that optimizes data distributions in an online fashion, concurrent with model training. Unlike existing techniques, ADO does not require external knowledge, proxy models, or modifications to the model update. Instead, ADO uses per-domain scaling laws to estimate the learning potential of each domain during training and adjusts the data mixture accordingly, making it more scalable and easier to integrate. Experiments demonstrate that ADO can achieve comparable or better performance than prior methods while maintaining computational efficiency across different computation scales, offering a practical solution for dynamically adjusting data distribution without sacrificing flexibility or increasing costs. Beyond its practical benefits, ADO also provides a new perspective on data collection strategies via scaling laws. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11820v1-abstract-full').style.display = 'none'; document.getElementById('2410.11820v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10875">arXiv:2410.10875</a> <span> [<a href="https://arxiv.org/pdf/2410.10875">pdf</a>, <a href="https://arxiv.org/format/2410.10875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SHyPar: A Spectral Coarsening Approach to Hypergraph Partitioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sajadinia%2C+H">Hamed Sajadinia</a>, <a href="/search/cs?searchtype=author&query=Aghdaei%2C+A">Ali Aghdaei</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhuo Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10875v2-abstract-short" style="display: inline;"> State-of-the-art hypergraph partitioners utilize a multilevel paradigm to construct progressively coarser hypergraphs across multiple layers, guiding cut refinements at each level of the hierarchy. Traditionally, these partitioners employ heuristic methods for coarsening and do not consider the structural features of hypergraphs. In this work, we introduce a multilevel spectral framework, SHyPar,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10875v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10875v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10875v2-abstract-full" style="display: none;"> State-of-the-art hypergraph partitioners utilize a multilevel paradigm to construct progressively coarser hypergraphs across multiple layers, guiding cut refinements at each level of the hierarchy. Traditionally, these partitioners employ heuristic methods for coarsening and do not consider the structural features of hypergraphs. In this work, we introduce a multilevel spectral framework, SHyPar, for partitioning large-scale hypergraphs by leveraging hyperedge effective resistances and flow-based community detection techniques. Inspired by the latest theoretical spectral clustering frameworks, such as HyperEF and HyperSF, SHyPar aims to decompose large hypergraphs into multiple subgraphs with few inter-partition hyperedges (cut size). A key component of SHyPar is a flow-based local clustering scheme for hypergraph coarsening, which incorporates a max-flow-based algorithm to produce clusters with substantially improved conductance. Additionally, SHyPar utilizes an effective resistance-based rating function for merging nodes that are strongly connected (coupled). Compared with existing state-of-the-art hypergraph partitioning methods, our extensive experimental results on real-world VLSI designs demonstrate that SHyPar can more effectively partition hypergraphs, achieving state-of-the-art solution quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10875v2-abstract-full').style.display = 'none'; document.getElementById('2410.10875v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 11 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10315">arXiv:2410.10315</a> <span> [<a href="https://arxiv.org/pdf/2410.10315">pdf</a>, <a href="https://arxiv.org/format/2410.10315">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhangchi Feng</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Z">Zhijie Nie</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yaowei Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10315v2-abstract-short" style="display: inline;"> This paper presents EasyRAG, a simple, lightweight, and efficient retrieval-augmented generation framework for automated network operations. Our framework has three advantages. The first is accurate question answering. We designed a straightforward RAG scheme based on (1) a specific data processing workflow (2) dual-route sparse retrieval for coarse ranking (3) LLM Reranker for reranking (4) LLM a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10315v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10315v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10315v2-abstract-full" style="display: none;"> This paper presents EasyRAG, a simple, lightweight, and efficient retrieval-augmented generation framework for automated network operations. Our framework has three advantages. The first is accurate question answering. We designed a straightforward RAG scheme based on (1) a specific data processing workflow (2) dual-route sparse retrieval for coarse ranking (3) LLM Reranker for reranking (4) LLM answer generation and optimization. This approach achieved first place in the GLM4 track in the preliminary round and second place in the GLM4 track in the semifinals. The second is simple deployment. Our method primarily consists of BM25 retrieval and BGE-reranker reranking, requiring no fine-tuning of any models, occupying minimal VRAM, easy to deploy, and highly scalable; we provide a flexible code library with various search and generation strategies, facilitating custom process implementation. The last one is efficient inference. We designed an efficient inference acceleration scheme for the entire coarse ranking, reranking, and generation process that significantly reduces the inference latency of RAG while maintaining a good level of accuracy; each acceleration scheme can be plug-and-play into any component of the RAG process, consistently enhancing the efficiency of the RAG system. Our code and data are released at \url{https://github.com/BUAADreamer/EasyRAG}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10315v2-abstract-full').style.display = 'none'; document.getElementById('2410.10315v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08568">arXiv:2410.08568</a> <span> [<a href="https://arxiv.org/pdf/2410.08568">pdf</a>, <a href="https://arxiv.org/format/2410.08568">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GPR Full-Waveform Inversion through Adaptive Filtering of Model Parameters and Gradients Using CNN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaxing Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zeliang Feng</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+S">Shengjie Qiao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+R">Runhuai Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengkai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08568v1-abstract-short" style="display: inline;"> GPR full-waveform inversion optimizes the subsurface property model iteratively to match the entire waveform information. However, the model gradients derived from wavefield continuation often contain errors, such as ghost values and excessively large values at transmitter and receiver points. Furthermore, models updated based on these gradients frequently exhibit unclear characterization of anoma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08568v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08568v1-abstract-full" style="display: none;"> GPR full-waveform inversion optimizes the subsurface property model iteratively to match the entire waveform information. However, the model gradients derived from wavefield continuation often contain errors, such as ghost values and excessively large values at transmitter and receiver points. Furthermore, models updated based on these gradients frequently exhibit unclear characterization of anomalous bodies or false anomalies, making it challenging to obtain accurate inversion results. To address these issues, we introduced a novel full-waveform inversion (FWI) framework that incorporates an embedded convolutional neural network (CNN) to adaptively filter model parameters and gradients. Specifically, we embedded the CNN module before the forward modeling process and ensured the entire FWI process remains differentiable. This design leverages the auto-grad tool of the deep learning library, allowing model values to pass through the CNN module during forward computation and model gradients to pass through the CNN module during backpropagation. Experiments have shown that filtering the model parameters during forward computation and the model gradients during backpropagation can ultimately yield high-quality inversion results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08568v1-abstract-full').style.display = 'none'; document.getElementById('2410.08568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 86A22 (Primary) 86A20; 68T07 (Secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.8; J.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08524">arXiv:2410.08524</a> <span> [<a href="https://arxiv.org/pdf/2410.08524">pdf</a>, <a href="https://arxiv.org/format/2410.08524">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> IGNN-Solver: A Graph Neural Solver for Implicit Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junchao Lin</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zenan Ling</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhanbo Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jingwen Xu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+R+C">Robert C Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08524v1-abstract-short" style="display: inline;"> Implicit graph neural networks (IGNNs), which exhibit strong expressive power with a single layer, have recently demonstrated remarkable performance in capturing long-range dependencies (LRD) in underlying graphs while effectively mitigating the over-smoothing problem. However, IGNNs rely on computationally expensive fixed-point iterations, which lead to significant speed and scalability limitatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08524v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08524v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08524v1-abstract-full" style="display: none;"> Implicit graph neural networks (IGNNs), which exhibit strong expressive power with a single layer, have recently demonstrated remarkable performance in capturing long-range dependencies (LRD) in underlying graphs while effectively mitigating the over-smoothing problem. However, IGNNs rely on computationally expensive fixed-point iterations, which lead to significant speed and scalability limitations, hindering their application to large-scale graphs. To achieve fast fixed-point solving for IGNNs, we propose a novel graph neural solver, IGNN-Solver, which leverages the generalized Anderson Acceleration method, parameterized by a small GNN, and learns iterative updates as a graph-dependent temporal process. Extensive experiments demonstrate that the IGNN-Solver significantly accelerates inference, achieving a $1.5\times$ to $8\times$ speedup without sacrificing accuracy. Moreover, this advantage becomes increasingly pronounced as the graph scale grows, facilitating its large-scale deployment in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08524v1-abstract-full').style.display = 'none'; document.getElementById('2410.08524v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07876">arXiv:2410.07876</a> <span> [<a href="https://arxiv.org/pdf/2410.07876">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FDDM: Frequency-Decomposed Diffusion Model for Rectum Cancer Dose Prediction in Radiotherapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+X">Xin Liao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenghao Feng</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jianghong Xiao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xingchen Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07876v1-abstract-short" style="display: inline;"> Accurate dose distribution prediction is crucial in the radiotherapy planning. Although previous methods based on convolutional neural network have shown promising performance, they have the problem of over-smoothing, leading to prediction without important high-frequency details. Recently, diffusion model has achieved great success in computer vision, which excels in generating images with more h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07876v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07876v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07876v1-abstract-full" style="display: none;"> Accurate dose distribution prediction is crucial in the radiotherapy planning. Although previous methods based on convolutional neural network have shown promising performance, they have the problem of over-smoothing, leading to prediction without important high-frequency details. Recently, diffusion model has achieved great success in computer vision, which excels in generating images with more high-frequency details, yet suffers from time-consuming and extensive computational resource consumption. To alleviate these problems, we propose Frequency-Decomposed Diffusion Model (FDDM) that refines the high-frequency subbands of the dose map. To be specific, we design a Coarse Dose Prediction Module (CDPM) to first predict a coarse dose map and then utilize discrete wavelet transform to decompose the coarse dose map into a low-frequency subband and three high-frequency subbands. There is a notable difference between the coarse predicted results and ground truth in high-frequency subbands. Therefore, we design a diffusion-based module called High-Frequency Refinement Module (HFRM) that performs diffusion operation in the high-frequency components of the dose map instead of the original dose map. Extensive experiments on an in-house dataset verify the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07876v1-abstract-full').style.display = 'none'; document.getElementById('2410.07876v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04783">arXiv:2410.04783</a> <span> [<a href="https://arxiv.org/pdf/2410.04783">pdf</a>, <a href="https://arxiv.org/format/2410.04783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> When GDD meets GNN: A Knowledge-driven Neural Connection for Effective Entity Resolution in Property Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junwei Hu</a>, <a href="/search/cs?searchtype=author&query=Bewong%2C+M">Michael Bewong</a>, <a href="/search/cs?searchtype=author&query=Kwashie%2C+S">Selasi Kwashie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yidi Zhang</a>, <a href="/search/cs?searchtype=author&query=Nofong%2C+V">Vincent Nofong</a>, <a href="/search/cs?searchtype=author&query=Wondoh%2C+J">John Wondoh</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zaiwen Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04783v1-abstract-short" style="display: inline;"> This paper studies the entity resolution (ER) problem in property graphs. ER is the task of identifying and linking different records that refer to the same real-world entity. It is commonly used in data integration, data cleansing, and other applications where it is important to have accurate and consistent data. In general, two predominant approaches exist in the literature: rule-based and learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04783v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04783v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04783v1-abstract-full" style="display: none;"> This paper studies the entity resolution (ER) problem in property graphs. ER is the task of identifying and linking different records that refer to the same real-world entity. It is commonly used in data integration, data cleansing, and other applications where it is important to have accurate and consistent data. In general, two predominant approaches exist in the literature: rule-based and learning-based methods. On the one hand, rule-based techniques are often desired due to their explainability and ability to encode domain knowledge. Learning-based methods, on the other hand, are preferred due to their effectiveness in spite of their black-box nature. In this work, we devise a hybrid ER solution, GraphER, that leverages the strengths of both systems for property graphs. In particular, we adopt graph differential dependency (GDD) for encoding the so-called record-matching rules, and employ them to guide a graph neural network (GNN) based representation learning for the task. We conduct extensive empirical evaluation of our proposal on benchmark ER datasets including 17 graph datasets and 7 relational datasets in comparison with 10 state-of-the-art (SOTA) techniques. The results show that our approach provides a significantly better solution to addressing ER in graph data, both quantitatively and qualitatively, while attaining highly competitive results on the benchmark relational datasets w.r.t. the SOTA solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04783v1-abstract-full').style.display = 'none'; document.getElementById('2410.04783v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02646">arXiv:2410.02646</a> <span> [<a href="https://arxiv.org/pdf/2410.02646">pdf</a>, <a href="https://arxiv.org/format/2410.02646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning 3D Perception from Others' Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yoo%2C+J">Jinsu Yoo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenyang Feng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+T">Tai-Yu Pan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yihong Sun</a>, <a href="/search/cs?searchtype=author&query=Phoo%2C+C+P">Cheng Perng Phoo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiangyu Chen</a>, <a href="/search/cs?searchtype=author&query=Campbell%2C+M">Mark Campbell</a>, <a href="/search/cs?searchtype=author&query=Weinberger%2C+K+Q">Kilian Q. Weinberger</a>, <a href="/search/cs?searchtype=author&query=Hariharan%2C+B">Bharath Hariharan</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+W">Wei-Lun Chao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02646v2-abstract-short" style="display: inline;"> Accurate 3D object detection in real-world environments requires a huge amount of annotated data with high quality. Acquiring such data is tedious and expensive, and often needs repeated effort when a new sensor is adopted or when the detector is deployed in a new environment. We investigate a new scenario to construct 3D object detectors: learning from the predictions of a nearby unit that is equ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02646v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02646v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02646v2-abstract-full" style="display: none;"> Accurate 3D object detection in real-world environments requires a huge amount of annotated data with high quality. Acquiring such data is tedious and expensive, and often needs repeated effort when a new sensor is adopted or when the detector is deployed in a new environment. We investigate a new scenario to construct 3D object detectors: learning from the predictions of a nearby unit that is equipped with an accurate detector. For example, when a self-driving car enters a new area, it may learn from other traffic participants whose detectors have been optimized for that area. This setting is label-efficient, sensor-agnostic, and communication-efficient: nearby units only need to share the predictions with the ego agent (e.g., car). Naively using the received predictions as ground-truths to train the detector for the ego car, however, leads to inferior performance. We systematically study the problem and identify viewpoint mismatches and mislocalization (due to synchronization and GPS errors) as the main causes, which unavoidably result in false positives, false negatives, and inaccurate pseudo labels. We propose a distance-based curriculum, first learning from closer units with similar viewpoints and subsequently improving the quality of other units' predictions via self-training. We further demonstrate that an effective pseudo label refinement module can be trained with a handful of annotated data, largely reducing the data quantity necessary to train an object detector. We validate our approach on the recently released real-world collaborative driving dataset, using reference cars' predictions as pseudo labels for the ego car. Extensive experiments including several scenarios (e.g., different sensors, detectors, and domains) demonstrate the effectiveness of our approach toward label-efficient learning of 3D perception from other units' predictions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02646v2-abstract-full').style.display = 'none'; document.getElementById('2410.02646v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02389">arXiv:2410.02389</a> <span> [<a href="https://arxiv.org/pdf/2410.02389">pdf</a>, <a href="https://arxiv.org/format/2410.02389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Meets Options: Hierarchical Generative Skill Composition for Temporally-Extended Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zeyu Feng</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+H">Hao Luan</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K+Y">Kevin Yuchen Ma</a>, <a href="/search/cs?searchtype=author&query=Soh%2C+H">Harold Soh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02389v1-abstract-short" style="display: inline;"> Safe and successful deployment of robots requires not only the ability to generate complex plans but also the capacity to frequently replan and correct execution errors. This paper addresses the challenge of long-horizon trajectory planning under temporally extended objectives in a receding horizon manner. To this end, we propose DOPPLER, a data-driven hierarchical framework that generates and upd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02389v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02389v1-abstract-full" style="display: none;"> Safe and successful deployment of robots requires not only the ability to generate complex plans but also the capacity to frequently replan and correct execution errors. This paper addresses the challenge of long-horizon trajectory planning under temporally extended objectives in a receding horizon manner. To this end, we propose DOPPLER, a data-driven hierarchical framework that generates and updates plans based on instruction specified by linear temporal logic (LTL). Our method decomposes temporal tasks into chain of options with hierarchical reinforcement learning from offline non-expert datasets. It leverages diffusion models to generate options with low-level actions. We devise a determinantal-guided posterior sampling technique during batch generation, which improves the speed and diversity of diffusion generated options, leading to more efficient querying. Experiments on robot navigation and manipulation tasks demonstrate that DOPPLER can generate sequences of trajectories that progressively satisfy the specified formulae for obstacle avoidance and sequential visitation. Demonstration videos are available online at: https://philiptheother.github.io/doppler/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02389v1-abstract-full').style.display = 'none'; document.getElementById('2410.02389v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18695">arXiv:2409.18695</a> <span> [<a href="https://arxiv.org/pdf/2409.18695">pdf</a>, <a href="https://arxiv.org/format/2409.18695">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KALE-LM: Unleash The Power Of AI For Science Via Knowledge And Logic Enhanced Large Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+W">Weichen Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yezeng Chen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zijie Dai</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhijie Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yubo Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yixuan Pan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+B">Baiyang Song</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+C">Chengli Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinhe Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhuoying Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18695v1-abstract-short" style="display: inline;"> Artificial intelligence is gradually demonstrating its immense potential, and increasing attention is being given to how AI can be harnessed to advance scientific research. In this vision paper, we present our perspectives on how AI can better assist scientific inquiry and explore corresponding technical approach. We have proposed and open-sourced a large model of our KALE-LM model series, Llama3-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18695v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18695v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18695v1-abstract-full" style="display: none;"> Artificial intelligence is gradually demonstrating its immense potential, and increasing attention is being given to how AI can be harnessed to advance scientific research. In this vision paper, we present our perspectives on how AI can better assist scientific inquiry and explore corresponding technical approach. We have proposed and open-sourced a large model of our KALE-LM model series, Llama3-KALE-LM-Chem-8B, which has achieved outstanding performance in tasks related to the field of chemistry. We hope that our work serves as a strong starting point, helping to realize more intelligent AI and promoting the advancement of human science and technology, as well as societal development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18695v1-abstract-full').style.display = 'none'; document.getElementById('2409.18695v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17630">arXiv:2409.17630</a> <span> [<a href="https://arxiv.org/pdf/2409.17630">pdf</a>, <a href="https://arxiv.org/format/2409.17630">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> System-Level Safety Monitoring and Recovery for Perception Failures in Autonomous Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chakraborty%2C+K">Kaustav Chakraborty</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zeyuan Feng</a>, <a href="/search/cs?searchtype=author&query=Veer%2C+S">Sushant Veer</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+A">Apoorva Sharma</a>, <a href="/search/cs?searchtype=author&query=Ivanovic%2C+B">Boris Ivanovic</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+S">Somil Bansal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17630v2-abstract-short" style="display: inline;"> The safety-critical nature of autonomous vehicle (AV) operation necessitates development of task-relevant algorithms that can reason about safety at the system level and not just at the component level. To reason about the impact of a perception failure on the entire system performance, such task-relevant algorithms must contend with various challenges: complexity of AV stacks, high uncertainty in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17630v2-abstract-full').style.display = 'inline'; document.getElementById('2409.17630v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17630v2-abstract-full" style="display: none;"> The safety-critical nature of autonomous vehicle (AV) operation necessitates development of task-relevant algorithms that can reason about safety at the system level and not just at the component level. To reason about the impact of a perception failure on the entire system performance, such task-relevant algorithms must contend with various challenges: complexity of AV stacks, high uncertainty in the operating environments, and the need for real-time performance. To overcome these challenges, in this work, we introduce a Q-network called SPARQ (abbreviation for Safety evaluation for Perception And Recovery Q-network) that evaluates the safety of a plan generated by a planning algorithm, accounting for perception failures that the planning process may have overlooked. This Q-network can be queried during system runtime to assess whether a proposed plan is safe for execution or poses potential safety risks. If a violation is detected, the network can then recommend a corrective plan while accounting for the perceptual failure. We validate our algorithm using the NuPlan-Vegas dataset, demonstrating its ability to handle cases where a perception failure compromises a proposed plan while the corrective plan remains safe. We observe an overall accuracy and recall of 90% while sustaining a frequency of 42Hz on the unseen testing dataset. We compare our performance to a popular reachability-based baseline and analyze some interesting properties of our approach in improving the safety properties of an AV pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17630v2-abstract-full').style.display = 'none'; document.getElementById('2409.17630v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17531">arXiv:2409.17531</a> <span> [<a href="https://arxiv.org/pdf/2409.17531">pdf</a>, <a href="https://arxiv.org/format/2409.17531">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SimVG: A Simple Framework for Visual Grounding with Decoupled Multi-modal Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+M">Ming Dai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lingfeng Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yihao Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenhua Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wankou Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17531v2-abstract-short" style="display: inline;"> Visual grounding is a common vision task that involves grounding descriptive sentences to the corresponding regions of an image. Most existing methods use independent image-text encoding and apply complex hand-crafted modules or encoder-decoder architectures for modal interaction and query reasoning. However, their performance significantly drops when dealing with complex textual expressions. This… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17531v2-abstract-full').style.display = 'inline'; document.getElementById('2409.17531v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17531v2-abstract-full" style="display: none;"> Visual grounding is a common vision task that involves grounding descriptive sentences to the corresponding regions of an image. Most existing methods use independent image-text encoding and apply complex hand-crafted modules or encoder-decoder architectures for modal interaction and query reasoning. However, their performance significantly drops when dealing with complex textual expressions. This is because the former paradigm only utilizes limited downstream data to fit the multi-modal feature fusion. Therefore, it is only effective when the textual expressions are relatively simple. In contrast, given the wide diversity of textual expressions and the uniqueness of downstream training data, the existing fusion module, which extracts multimodal content from a visual-linguistic context, has not been fully investigated. In this paper, we present a simple yet robust transformer-based framework, SimVG, for visual grounding. Specifically, we decouple visual-linguistic feature fusion from downstream tasks by leveraging existing multimodal pre-trained models and incorporating additional object tokens to facilitate deep integration of downstream and pre-training tasks. Furthermore, we design a dynamic weight-balance distillation method in the multi-branch synchronous learning process to enhance the representation capability of the simpler branch. This branch only consists of a lightweight MLP, which simplifies the structure and improves reasoning speed. Experiments on six widely used VG datasets, i.e., RefCOCO/+/g, ReferIt, Flickr30K, and GRefCOCO, demonstrate the superiority of SimVG. Finally, the proposed method not only achieves improvements in efficiency and convergence speed but also attains new state-of-the-art performance on these benchmarks. Codes and models will be available at \url{https://github.com/Dmmm1997/SimVG}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17531v2-abstract-full').style.display = 'none'; document.getElementById('2409.17531v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24pages, 18figures, NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14882">arXiv:2409.14882</a> <span> [<a href="https://arxiv.org/pdf/2409.14882">pdf</a>, <a href="https://arxiv.org/format/2409.14882">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Probabilistically Aligned View-unaligned Clustering with Adaptive Template Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+W">Wenhua Dong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiao-Jun Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenhua Feng</a>, <a href="/search/cs?searchtype=author&query=Atito%2C+S">Sara Atito</a>, <a href="/search/cs?searchtype=author&query=Awais%2C+M">Muhammad Awais</a>, <a href="/search/cs?searchtype=author&query=Kittler%2C+J">Josef Kittler</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14882v1-abstract-short" style="display: inline;"> In most existing multi-view modeling scenarios, cross-view correspondence (CVC) between instances of the same target from different views, like paired image-text data, is a crucial prerequisite for effortlessly deriving a consistent representation. Nevertheless, this premise is frequently compromised in certain applications, where each view is organized and transmitted independently, resulting in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14882v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14882v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14882v1-abstract-full" style="display: none;"> In most existing multi-view modeling scenarios, cross-view correspondence (CVC) between instances of the same target from different views, like paired image-text data, is a crucial prerequisite for effortlessly deriving a consistent representation. Nevertheless, this premise is frequently compromised in certain applications, where each view is organized and transmitted independently, resulting in the view-unaligned problem (VuP). Restoring CVC of unaligned multi-view data is a challenging and highly demanding task that has received limited attention from the research community. To tackle this practical challenge, we propose to integrate the permutation derivation procedure into the bipartite graph paradigm for view-unaligned clustering, termed Probabilistically Aligned View-unaligned Clustering with Adaptive Template Selection (PAVuC-ATS). Specifically, we learn consistent anchors and view-specific graphs by the bipartite graph, and derive permutations applied to the unaligned graphs by reformulating the alignment between two latent representations as a 2-step transition of a Markov chain with adaptive template selection, thereby achieving the probabilistic alignment. The convergence of the resultant optimization problem is validated both experimentally and theoretically. Extensive experiments on six benchmark datasets demonstrate the superiority of the proposed PAVuC-ATS over the baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14882v1-abstract-full').style.display = 'none'; document.getElementById('2409.14882v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12411">arXiv:2409.12411</a> <span> [<a href="https://arxiv.org/pdf/2409.12411">pdf</a>, <a href="https://arxiv.org/format/2409.12411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Textualized Agent-Style Reasoning for Complex Tasks by Multiple Round LLM Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+C">Chen Liang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhifan Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zihe Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenbin Jiang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinan Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yufeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12411v1-abstract-short" style="display: inline;"> Chain-of-thought prompting significantly boosts the reasoning ability of large language models but still faces three issues: hallucination problem, restricted interpretability, and uncontrollable generation. To address these challenges, we present AgentCOT, a llm-based autonomous agent framework, which can solve complex problems in an agent-style manner by multiple round LLM generation. At each st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12411v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12411v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12411v1-abstract-full" style="display: none;"> Chain-of-thought prompting significantly boosts the reasoning ability of large language models but still faces three issues: hallucination problem, restricted interpretability, and uncontrollable generation. To address these challenges, we present AgentCOT, a llm-based autonomous agent framework, which can solve complex problems in an agent-style manner by multiple round LLM generation. At each step, AgentCOT selects an action and executes it to yield an intermediate result with supporting evidence. In addition, we integrate the step's index into the reasoning process to form a graph structure for complex inference logic. We introduce two new strategies to enhance the performance of AgentCOT.We conduct extensive experiments to verify the effectiveness of our method on six common benchmarks. Results exhibit that our method brings in substantial improvements over current competitive approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12411v1-abstract-full').style.display = 'none'; document.getElementById('2409.12411v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03218">arXiv:2409.03218</a> <span> [<a href="https://arxiv.org/pdf/2409.03218">pdf</a>, <a href="https://arxiv.org/format/2409.03218">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Application Research On Real-Time Perception Of Device Performance Status </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianwen Wu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wangzhong Xiao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yidong Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zihua Feng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dian Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongchen Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Bo Liang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+J">Jiaojiao Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03218v1-abstract-short" style="display: inline;"> In order to accurately identify the performance status of mobile devices and finely adjust the user experience, a real-time performance perception evaluation method based on TOPSIS (Technique for Order Preference by Similarity to Ideal Solution) combined with entropy weighting method and time series model construction was studied. After collecting the performance characteristics of various mobile… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03218v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03218v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03218v1-abstract-full" style="display: none;"> In order to accurately identify the performance status of mobile devices and finely adjust the user experience, a real-time performance perception evaluation method based on TOPSIS (Technique for Order Preference by Similarity to Ideal Solution) combined with entropy weighting method and time series model construction was studied. After collecting the performance characteristics of various mobile devices, the device performance profile was fitted by using PCA (principal component analysis) dimensionality reduction and feature engineering methods such as descriptive time series analysis. The ability of performance features and profiles to describe the real-time performance status of devices was understood and studied by applying the TOPSIS method and multi-level weighting processing. A time series model was constructed for the feature set under objective weighting, and multiple sensitivity (real-time, short-term, long-term) performance status perception results were provided to obtain real-time performance evaluation data and long-term stable performance prediction data. Finally, by configuring dynamic AB experiments and overlaying fine-grained power reduction strategies, the usability of the method was verified, and the accuracy of device performance status identification and prediction was compared with the performance of the profile features including dimensionality reduction time series modeling, TOPSIS method and entropy weighting method, subjective weighting, HMA method. The results show that accurate real-time performance perception results can greatly enhance business value, and this research has application effectiveness and certain forward-looking significance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03218v1-abstract-full').style.display = 'none'; document.getElementById('2409.03218v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16415">arXiv:2408.16415</a> <span> [<a href="https://arxiv.org/pdf/2408.16415">pdf</a>, <a href="https://arxiv.org/format/2408.16415">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> UAV's Rotor Micro-Doppler Feature Extraction Using Integrated Sensing and Communication Signal: Algorithm Design and Testbed Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jiachen Wei</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">Dingyou Ma</a>, <a href="/search/cs?searchtype=author&query=He%2C+F">Feiyang He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qixun Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhiyong Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengfeng Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+T">Taohong Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16415v1-abstract-short" style="display: inline;"> With the rapid application of unmanned aerial vehicles (UAVs) in urban areas, the identification and tracking of hovering UAVs have become critical challenges, significantly impacting the safety of aircraft take-off and landing operations. As a promising technology for 6G mobile systems, integrated sensing and communication (ISAC) can be used to detect high-mobility UAVs with a low deployment cost… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16415v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16415v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16415v1-abstract-full" style="display: none;"> With the rapid application of unmanned aerial vehicles (UAVs) in urban areas, the identification and tracking of hovering UAVs have become critical challenges, significantly impacting the safety of aircraft take-off and landing operations. As a promising technology for 6G mobile systems, integrated sensing and communication (ISAC) can be used to detect high-mobility UAVs with a low deployment cost. The micro-Doppler signals from UAV rotors can be leveraged to address the detection of low-mobility and hovering UAVs using ISAC signals. However, determining whether the frame structure of the ISAC system can be used to identify UAVs, and how to accurately capture the weak rotor micro-Doppler signals of UAVs in complex environments, remain two challenging problems. This paper first proposes a novel frame structure for UAV micro-Doppler extraction and the representation of UAV micro-Doppler signals within the channel state information (CSI). Furthermore, to address complex environments and the interference caused by UAV body vibrations, the rotor micro-Doppler null space pursuit (rmD-NSP) algorithm and the feature extraction algorithm synchroextracting transform (SET) are designed to effectively separate UAV's rotor micro-Doppler signals and enhance their features in the spectrogram. Finally, both simulation and hardware testbed demonstrate that the proposed rmD-NSP algorithm enables the ISAC base station (BS) to accurately and completely extract UAV's rotor micro-Doppler signals. Within a 0.1s observation period, ISAC BS successfully captures eight rotations of the DJI M300 RTK UAV's rotor in urban environments. Compared to the existing AM-FM NSP and NSP signal decomposition algorithms, the integrity of the rotor micro-Doppler features is improved by 60%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16415v1-abstract-full').style.display = 'none'; document.getElementById('2408.16415v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14047">arXiv:2408.14047</a> <span> [<a href="https://arxiv.org/pdf/2408.14047">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Alleviating Class Imbalance in Semi-supervised Multi-organ Segmentation via Balanced Subclass Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenghao Feng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+L">Lu Wen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+B">Binyu Yan</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+J">Jiaqi Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14047v1-abstract-short" style="display: inline;"> Semi-supervised learning (SSL) has shown notable potential in relieving the heavy demand of dense prediction tasks on large-scale well-annotated datasets, especially for the challenging multi-organ segmentation (MoS). However, the prevailing class-imbalance problem in MoS, caused by the substantial variations in organ size, exacerbates the learning difficulty of the SSL network. To alleviate this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14047v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14047v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14047v1-abstract-full" style="display: none;"> Semi-supervised learning (SSL) has shown notable potential in relieving the heavy demand of dense prediction tasks on large-scale well-annotated datasets, especially for the challenging multi-organ segmentation (MoS). However, the prevailing class-imbalance problem in MoS, caused by the substantial variations in organ size, exacerbates the learning difficulty of the SSL network. To alleviate this issue, we present a two-phase semi-supervised network (BSR-Net) with balanced subclass regularization for MoS. Concretely, in Phase I, we introduce a class-balanced subclass generation strategy based on balanced clustering to effectively generate multiple balanced subclasses from original biased ones according to their pixel proportions. Then, in Phase II, we design an auxiliary subclass segmentation (SCS) task within the multi-task framework of the main MoS task. The SCS task contributes a balanced subclass regularization to the main MoS task and transfers unbiased knowledge to the MoS network, thus alleviating the influence of the class-imbalance problem. Extensive experiments conducted on two publicly available datasets, i.e., the MICCAI FLARE 2022 dataset and the WORD dataset, verify the superior performance of our method compared with other methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14047v1-abstract-full').style.display = 'none'; document.getElementById('2408.14047v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13981">arXiv:2408.13981</a> <span> [<a href="https://arxiv.org/pdf/2408.13981">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ARANet: Attention-based Residual Adversarial Network with Deep Supervision for Radiotherapy Dose Prediction of Cervical Cancer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+L">Lu Wen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wenxia Yin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenghao Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+D">Deng Xiong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13981v1-abstract-short" style="display: inline;"> Radiation therapy is the mainstay treatment for cervical cancer, and its ultimate goal is to ensure the planning target volume (PTV) reaches the prescribed dose while reducing dose deposition of organs-at-risk (OARs) as much as possible. To achieve these clinical requirements, the medical physicist needs to manually tweak the radiotherapy plan repeatedly in a trial-anderror manner until finding th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13981v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13981v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13981v1-abstract-full" style="display: none;"> Radiation therapy is the mainstay treatment for cervical cancer, and its ultimate goal is to ensure the planning target volume (PTV) reaches the prescribed dose while reducing dose deposition of organs-at-risk (OARs) as much as possible. To achieve these clinical requirements, the medical physicist needs to manually tweak the radiotherapy plan repeatedly in a trial-anderror manner until finding the optimal one in the clinic. However, such trial-and-error processes are quite time-consuming, and the quality of plans highly depends on the experience of the medical physicist. In this paper, we propose an end-to-end Attentionbased Residual Adversarial Network with deep supervision, namely ARANet, to automatically predict the 3D dose distribution of cervical cancer. Specifically, given the computer tomography (CT) images and their corresponding segmentation masks of PTV and OARs, ARANet employs a prediction network to generate the dose maps. We also utilize a multi-scale residual attention module and deep supervision mechanism to enforce the prediction network to extract more valuable dose features while suppressing irrelevant information. Our proposed method is validated on an in-house dataset including 54 cervical cancer patients, and experimental results have demonstrated its obvious superiority compared to other state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13981v1-abstract-full').style.display = 'none'; document.getElementById('2408.13981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2024 IEEE International Conference on Cybernetics and Intelligent Systems (CIS) and IEEE Conference on Robotics, Automation and Mechatronics (RAM)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13397">arXiv:2408.13397</a> <span> [<a href="https://arxiv.org/pdf/2408.13397">pdf</a>, <a href="https://arxiv.org/format/2408.13397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Perturbation on Feature Coalition: Towards Interpretable Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuran Hu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mingzhe Zhu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenpeng Feng</a>, <a href="/search/cs?searchtype=author&query=Dakovi%C4%87%2C+M">Milo拧 Dakovi膰</a>, <a href="/search/cs?searchtype=author&query=Stankovi%C4%87%2C+L">Ljubi拧a Stankovi膰</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13397v1-abstract-short" style="display: inline;"> The inherent "black box" nature of deep neural networks (DNNs) compromises their transparency and reliability. Recently, explainable AI (XAI) has garnered increasing attention from researchers. Several perturbation-based interpretations have emerged. However, these methods often fail to adequately consider feature dependencies. To solve this problem, we introduce a perturbation-based interpretatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13397v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13397v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13397v1-abstract-full" style="display: none;"> The inherent "black box" nature of deep neural networks (DNNs) compromises their transparency and reliability. Recently, explainable AI (XAI) has garnered increasing attention from researchers. Several perturbation-based interpretations have emerged. However, these methods often fail to adequately consider feature dependencies. To solve this problem, we introduce a perturbation-based interpretation guided by feature coalitions, which leverages deep information of network to extract correlated features. Then, we proposed a carefully-designed consistency loss to guide network interpretation. Both quantitative and qualitative experiments are conducted to validate the effectiveness of our proposed method. Code is available at github.com/Teriri1999/Perturebation-on-Feature-Coalition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13397v1-abstract-full').style.display = 'none'; document.getElementById('2408.13397v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11554">arXiv:2408.11554</a> <span> [<a href="https://arxiv.org/pdf/2408.11554">pdf</a>, <a href="https://arxiv.org/format/2408.11554">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Differentiating Choices via Commonality for Multiple-Choice Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+W">Wenqing Deng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kewen Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shirui Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaowang Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhiyong Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11554v1-abstract-short" style="display: inline;"> Multiple-choice question answering (MCQA) becomes particularly challenging when all choices are relevant to the question and are semantically similar. Yet this setting of MCQA can potentially provide valuable clues for choosing the right answer. Existing models often rank each choice separately, overlooking the context provided by other choices. Specifically, they fail to leverage the semantic com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11554v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11554v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11554v1-abstract-full" style="display: none;"> Multiple-choice question answering (MCQA) becomes particularly challenging when all choices are relevant to the question and are semantically similar. Yet this setting of MCQA can potentially provide valuable clues for choosing the right answer. Existing models often rank each choice separately, overlooking the context provided by other choices. Specifically, they fail to leverage the semantic commonalities and nuances among the choices for reasoning. In this paper, we propose a novel MCQA model by differentiating choices through identifying and eliminating their commonality, called DCQA. Our model captures token-level attention of each choice to the question, and separates tokens of the question attended to by all the choices (i.e., commonalities) from those by individual choices (i.e., nuances). Using the nuances as refined contexts for the choices, our model can effectively differentiate choices with subtle differences and provide justifications for choosing the correct answer. We conduct comprehensive experiments across five commonly used MCQA benchmarks, demonstrating that DCQA consistently outperforms baseline models. Furthermore, our case study illustrates the effectiveness of the approach in directing the attention of the model to more differentiating features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11554v1-abstract-full').style.display = 'none'; document.getElementById('2408.11554v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, accepted to ECAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09469">arXiv:2408.09469</a> <span> [<a href="https://arxiv.org/pdf/2408.09469">pdf</a>, <a href="https://arxiv.org/format/2408.09469">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Adversarial Transferability with Adversarial Weight Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiahao Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhou Feng</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+R">Rui Zeng</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yuwen Pu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+C">Chunyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yi Jiang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Y">Yuyou Gan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinbao Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Shouling Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09469v2-abstract-short" style="display: inline;"> Deep neural networks (DNNs) are vulnerable to adversarial examples (AEs) that mislead the model while appearing benign to human observers. A critical concern is the transferability of AEs, which enables black-box attacks without direct access to the target model. However, many previous attacks have failed to explain the intrinsic mechanism of adversarial transferability. In this paper, we rethink… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09469v2-abstract-full').style.display = 'inline'; document.getElementById('2408.09469v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09469v2-abstract-full" style="display: none;"> Deep neural networks (DNNs) are vulnerable to adversarial examples (AEs) that mislead the model while appearing benign to human observers. A critical concern is the transferability of AEs, which enables black-box attacks without direct access to the target model. However, many previous attacks have failed to explain the intrinsic mechanism of adversarial transferability. In this paper, we rethink the property of transferable AEs and reformalize the formulation of transferability. Building on insights from this mechanism, we analyze the generalization of AEs across models with different architectures and prove that we can find a local perturbation to mitigate the gap between surrogate and target models. We further establish the inner connections between model smoothness and flat local maxima, both of which contribute to the transferability of AEs. Further, we propose a new adversarial attack algorithm, \textbf{A}dversarial \textbf{W}eight \textbf{T}uning (AWT), which adaptively adjusts the parameters of the surrogate model using generated AEs to optimize the flat local maxima and model smoothness simultaneously, without the need for extra data. AWT is a data-free tuning method that combines gradient-based and model-based attack methods to enhance the transferability of AEs. Extensive experiments on a variety of models with different architectures on ImageNet demonstrate that AWT yields superior performance over other attacks, with an average increase of nearly 5\% and 10\% attack success rates on CNN-based and Transformer-based models, respectively, compared to state-of-the-art attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09469v2-abstract-full').style.display = 'none'; document.getElementById('2408.09469v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09441">arXiv:2408.09441</a> <span> [<a href="https://arxiv.org/pdf/2408.09441">pdf</a>, <a href="https://arxiv.org/format/2408.09441">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CLIP-CID: Efficient CLIP Distillation via Cluster-Instance Discrimination </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaicheng Yang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+T">Tiancheng Gu</a>, <a href="/search/cs?searchtype=author&query=An%2C+X">Xiang An</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiqiang Jiang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiangzi Dai</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziyong Feng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+W">Weidong Cai</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiankang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09441v1-abstract-short" style="display: inline;"> Contrastive Language-Image Pre-training (CLIP) has achieved excellent performance over a wide range of tasks. However, the effectiveness of CLIP heavily relies on a substantial corpus of pre-training data, resulting in notable consumption of computational resources. Although knowledge distillation has been widely applied in single modality models, how to efficiently expand knowledge distillation t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09441v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09441v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09441v1-abstract-full" style="display: none;"> Contrastive Language-Image Pre-training (CLIP) has achieved excellent performance over a wide range of tasks. However, the effectiveness of CLIP heavily relies on a substantial corpus of pre-training data, resulting in notable consumption of computational resources. Although knowledge distillation has been widely applied in single modality models, how to efficiently expand knowledge distillation to vision-language foundation models with extensive data remains relatively unexplored. In this paper, we introduce CLIP-CID, a novel distillation mechanism that effectively transfers knowledge from a large vision-language foundation model to a smaller model. We initially propose a simple but efficient image semantic balance method to reduce transfer learning bias and improve distillation efficiency. This method filters out 43.7% of image-text pairs from the LAION400M while maintaining superior performance. After that, we leverage cluster-instance discrimination to facilitate knowledge transfer from the teacher model to the student model, thereby empowering the student model to acquire a holistic semantic comprehension of the pre-training data. Experimental results demonstrate that CLIP-CID achieves state-of-the-art performance on various downstream tasks including linear probe and zero-shot classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09441v1-abstract-full').style.display = 'none'; document.getElementById('2408.09441v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages,8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07685">arXiv:2408.07685</a> <span> [<a href="https://arxiv.org/pdf/2408.07685">pdf</a>, <a href="https://arxiv.org/ps/2408.07685">ps</a>, <a href="https://arxiv.org/format/2408.07685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Auto-bidding and Auctions in Online Advertising: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aggarwal%2C+G">Gagan Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Badanidiyuru%2C+A">Ashwinkumar Badanidiyuru</a>, <a href="/search/cs?searchtype=author&query=Balseiro%2C+S+R">Santiago R. Balseiro</a>, <a href="/search/cs?searchtype=author&query=Bhawalkar%2C+K">Kshipra Bhawalkar</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yuan Deng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhe Feng</a>, <a href="/search/cs?searchtype=author&query=Goel%2C+G">Gagan Goel</a>, <a href="/search/cs?searchtype=author&query=Liaw%2C+C">Christopher Liaw</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haihao Lu</a>, <a href="/search/cs?searchtype=author&query=Mahdian%2C+M">Mohammad Mahdian</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jieming Mao</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+A">Aranyak Mehta</a>, <a href="/search/cs?searchtype=author&query=Mirrokni%2C+V">Vahab Mirrokni</a>, <a href="/search/cs?searchtype=author&query=Leme%2C+R+P">Renato Paes Leme</a>, <a href="/search/cs?searchtype=author&query=Perlroth%2C+A">Andres Perlroth</a>, <a href="/search/cs?searchtype=author&query=Piliouras%2C+G">Georgios Piliouras</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+J">Jon Schneider</a>, <a href="/search/cs?searchtype=author&query=Schvartzman%2C+A">Ariel Schvartzman</a>, <a href="/search/cs?searchtype=author&query=Sivan%2C+B">Balasubramanian Sivan</a>, <a href="/search/cs?searchtype=author&query=Spendlove%2C+K">Kelly Spendlove</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+Y">Yifeng Teng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanrui Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingfei Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wennan Zhu</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07685v1-abstract-short" style="display: inline;"> In this survey, we summarize recent developments in research fueled by the growing adoption of automated bidding strategies in online advertising. We explore the challenges and opportunities that have arisen as markets embrace this autobidding and cover a range of topics in this area, including bidding algorithms, equilibrium analysis and efficiency of common auction formats, and optimal auction d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07685v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07685v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07685v1-abstract-full" style="display: none;"> In this survey, we summarize recent developments in research fueled by the growing adoption of automated bidding strategies in online advertising. We explore the challenges and opportunities that have arisen as markets embrace this autobidding and cover a range of topics in this area, including bidding algorithms, equilibrium analysis and efficiency of common auction formats, and optimal auction design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07685v1-abstract-full').style.display = 'none'; document.getElementById('2408.07685v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07600">arXiv:2408.07600</a> <span> [<a href="https://arxiv.org/pdf/2408.07600">pdf</a>, <a href="https://arxiv.org/format/2408.07600">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Disentangle and denoise: Tackling context misalignment for video moment retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kaijing Ma</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Han Fang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+X">Xianghao Zang</a>, <a href="/search/cs?searchtype=author&query=Ban%2C+C">Chao Ban</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Lanxiang Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhongjiang He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongxiang Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zerun Feng</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+X">Xingsong Hou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07600v1-abstract-short" style="display: inline;"> Video Moment Retrieval, which aims to locate in-context video moments according to a natural language query, is an essential task for cross-modal grounding. Existing methods focus on enhancing the cross-modal interactions between all moments and the textual description for video understanding. However, constantly interacting with all locations is unreasonable because of uneven semantic distributio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07600v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07600v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07600v1-abstract-full" style="display: none;"> Video Moment Retrieval, which aims to locate in-context video moments according to a natural language query, is an essential task for cross-modal grounding. Existing methods focus on enhancing the cross-modal interactions between all moments and the textual description for video understanding. However, constantly interacting with all locations is unreasonable because of uneven semantic distribution across the timeline and noisy visual backgrounds. This paper proposes a cross-modal Context Denoising Network (CDNet) for accurate moment retrieval by disentangling complex correlations and denoising irrelevant dynamics.Specifically, we propose a query-guided semantic disentanglement (QSD) to decouple video moments by estimating alignment levels according to the global and fine-grained correlation. A Context-aware Dynamic Denoisement (CDD) is proposed to enhance understanding of aligned spatial-temporal details by learning a group of query-relevant offsets. Extensive experiments on public benchmarks demonstrate that the proposed CDNet achieves state-of-the-art performances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07600v1-abstract-full').style.display = 'none'; document.getElementById('2408.07600v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07219">arXiv:2408.07219</a> <span> [<a href="https://arxiv.org/pdf/2408.07219">pdf</a>, <a href="https://arxiv.org/format/2408.07219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div> <p class="title is-5 mathjax"> Causal Effect Estimation using identifiable Variational AutoEncoder with Latent Confounders and Post-Treatment Variables </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yang Xie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziqi Xu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+D">Debo Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiuyong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zaiwen Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07219v1-abstract-short" style="display: inline;"> Estimating causal effects from observational data is challenging, especially in the presence of latent confounders. Much work has been done on addressing this challenge, but most of the existing research ignores the bias introduced by the post-treatment variables. In this paper, we propose a novel method of joint Variational AutoEncoder (VAE) and identifiable Variational AutoEncoder (iVAE) for lea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07219v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07219v1-abstract-full" style="display: none;"> Estimating causal effects from observational data is challenging, especially in the presence of latent confounders. Much work has been done on addressing this challenge, but most of the existing research ignores the bias introduced by the post-treatment variables. In this paper, we propose a novel method of joint Variational AutoEncoder (VAE) and identifiable Variational AutoEncoder (iVAE) for learning the representations of latent confounders and latent post-treatment variables from their proxy variables, termed CPTiVAE, to achieve unbiased causal effect estimation from observational data. We further prove the identifiability in terms of the representation of latent post-treatment variables. Extensive experiments on synthetic and semi-synthetic datasets demonstrate that the CPTiVAE outperforms the state-of-the-art methods in the presence of latent confounders and post-treatment variables. We further apply CPTiVAE to a real-world dataset to show its potential application. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07219v1-abstract-full').style.display = 'none'; document.getElementById('2408.07219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05151">arXiv:2408.05151</a> <span> [<a href="https://arxiv.org/pdf/2408.05151">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Meta-Learning Guided Label Noise Distillation for Robust Signal Modulation Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoyang Hao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhixi Feng</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+T">Tongqing Peng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05151v1-abstract-short" style="display: inline;"> Automatic modulation classification (AMC) is an effective way to deal with physical layer threats of the internet of things (IoT). However, there is often label mislabeling in practice, which significantly impacts the performance and robustness of deep neural networks (DNNs). In this paper, we propose a meta-learning guided label noise distillation method for robust AMC. Specifically, a teacher-st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05151v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05151v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05151v1-abstract-full" style="display: none;"> Automatic modulation classification (AMC) is an effective way to deal with physical layer threats of the internet of things (IoT). However, there is often label mislabeling in practice, which significantly impacts the performance and robustness of deep neural networks (DNNs). In this paper, we propose a meta-learning guided label noise distillation method for robust AMC. Specifically, a teacher-student heterogeneous network (TSHN) framework is proposed to distill and reuse label noise. Based on the idea that labels are representations, the teacher network with trusted meta-learning divides and conquers untrusted label samples and then guides the student network to learn better by reassessing and correcting labels. Furthermore, we propose a multi-view signal (MVS) method to further improve the performance of hard-to-classify categories with few-shot trusted label samples. Extensive experimental results show that our methods can significantly improve the performance and robustness of signal AMC in various and complex label noise scenarios, which is crucial for securing IoT applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05151v1-abstract-full').style.display = 'none'; document.getElementById('2408.05151v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; C.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02970">arXiv:2408.02970</a> <span> [<a href="https://arxiv.org/pdf/2408.02970">pdf</a>, <a href="https://arxiv.org/format/2408.02970">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EC-Guide: A Comprehensive E-Commerce Guide for Instruction Tuning and Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhaopeng Feng</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zijie Meng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zuozhu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02970v1-abstract-short" style="display: inline;"> Large language models (LLMs) have attracted considerable attention in various fields for their cost-effective solutions to diverse challenges, especially with advancements in instruction tuning and quantization. E-commerce, with its complex tasks and extensive product-user interactions, presents a promising application area for LLMs. However, the domain-specific concepts and knowledge inherent in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02970v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02970v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02970v1-abstract-full" style="display: none;"> Large language models (LLMs) have attracted considerable attention in various fields for their cost-effective solutions to diverse challenges, especially with advancements in instruction tuning and quantization. E-commerce, with its complex tasks and extensive product-user interactions, presents a promising application area for LLMs. However, the domain-specific concepts and knowledge inherent in e-commerce pose significant challenges for adapting general LLMs. To address this issue, we developed EC-Guide \href{https://github.com/fzp0424/EC-Guide-KDDUP-2024}, a comprehensive e-commerce guide for instruction tuning and quantization of LLMs. We also heuristically integrated Chain-of-Thought (CoT) during inference to enhance arithmetic performance. Our approach achieved the 2nd place in Track 2 and 5th place in Track 5 at the Amazon KDD Cup'24 \href{https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms}. Additionally, our solution is model-agnostic, enabling effective scalability across larger systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02970v1-abstract-full').style.display = 'none'; document.getElementById('2408.02970v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01553">arXiv:2408.01553</a> <span> [<a href="https://arxiv.org/pdf/2408.01553">pdf</a>, <a href="https://arxiv.org/format/2408.01553">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Multi-task SAR Image Processing via GAN-based Unsupervised Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuran Hu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mingzhe Zhu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziqiang Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhenpeng Feng</a>, <a href="/search/cs?searchtype=author&query=Stankovic%2C+L">Ljubisa Stankovic</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01553v1-abstract-short" style="display: inline;"> Generative Adversarial Networks (GANs) have shown tremendous potential in synthesizing a large number of realistic SAR images by learning patterns in the data distribution. Some GANs can achieve image editing by introducing latent codes, demonstrating significant promise in SAR image processing. Compared to traditional SAR image processing methods, editing based on GAN latent space control is enti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01553v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01553v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01553v1-abstract-full" style="display: none;"> Generative Adversarial Networks (GANs) have shown tremendous potential in synthesizing a large number of realistic SAR images by learning patterns in the data distribution. Some GANs can achieve image editing by introducing latent codes, demonstrating significant promise in SAR image processing. Compared to traditional SAR image processing methods, editing based on GAN latent space control is entirely unsupervised, allowing image processing to be conducted without any labeled data. Additionally, the information extracted from the data is more interpretable. This paper proposes a novel SAR image processing framework called GAN-based Unsupervised Editing (GUE), aiming to address the following two issues: (1) disentangling semantic directions in the GAN latent space and finding meaningful directions; (2) establishing a comprehensive SAR image processing framework while achieving multiple image processing functions. In the implementation of GUE, we decompose the entangled semantic directions in the GAN latent space by training a carefully designed network. Moreover, we can accomplish multiple SAR image processing tasks (including despeckling, localization, auxiliary identification, and rotation editing) in a single training process without any form of supervision. Extensive experiments validate the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01553v1-abstract-full').style.display = 'none'; document.getElementById('2408.01553v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 17 figures, 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01181">arXiv:2408.01181</a> <span> [<a href="https://arxiv.org/pdf/2408.01181">pdf</a>, <a href="https://arxiv.org/format/2408.01181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VAR-CLIP: Text-to-Image Generator with Visual Auto-Regressive Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiangzi Dai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+N">Ninghua Yang</a>, <a href="/search/cs?searchtype=author&query=An%2C+X">Xiang An</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziyong Feng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xingyu Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01181v1-abstract-short" style="display: inline;"> VAR is a new generation paradigm that employs 'next-scale prediction' as opposed to 'next-token prediction'. This innovative transformation enables auto-regressive (AR) transformers to rapidly learn visual distributions and achieve robust generalization. However, the original VAR model is constrained to class-conditioned synthesis, relying solely on textual captions for guidance. In this paper, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01181v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01181v1-abstract-full" style="display: none;"> VAR is a new generation paradigm that employs 'next-scale prediction' as opposed to 'next-token prediction'. This innovative transformation enables auto-regressive (AR) transformers to rapidly learn visual distributions and achieve robust generalization. However, the original VAR model is constrained to class-conditioned synthesis, relying solely on textual captions for guidance. In this paper, we introduce VAR-CLIP, a novel text-to-image model that integrates Visual Auto-Regressive techniques with the capabilities of CLIP. The VAR-CLIP framework encodes captions into text embeddings, which are then utilized as textual conditions for image generation. To facilitate training on extensive datasets, such as ImageNet, we have constructed a substantial image-text dataset leveraging BLIP2. Furthermore, we delve into the significance of word positioning within CLIP for the purpose of caption guidance. Extensive experiments confirm VAR-CLIP's proficiency in generating fantasy images with high fidelity, textual congruence, and aesthetic excellence. Our project page are https://github.com/daixiangzi/VAR-CLIP <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01181v1-abstract-full').style.display = 'none'; document.getElementById('2408.01181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">total 10 pages, code:https://github.com/daixiangzi/VAR-CLIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21022">arXiv:2407.21022</a> <span> [<a href="https://arxiv.org/pdf/2407.21022">pdf</a>, <a href="https://arxiv.org/format/2407.21022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey on Retrieval Methods in Recommender Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jizheng Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+J">Jiarui Qin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziming Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21022v1-abstract-short" style="display: inline;"> In an era dominated by information overload, effective recommender systems are essential for managing the deluge of data across digital platforms. Multi-stage cascade ranking systems are widely used in the industry, with retrieval and ranking being two typical stages. Retrieval methods sift through vast candidates to filter out irrelevant items, while ranking methods prioritize these candidates to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21022v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21022v1-abstract-full" style="display: none;"> In an era dominated by information overload, effective recommender systems are essential for managing the deluge of data across digital platforms. Multi-stage cascade ranking systems are widely used in the industry, with retrieval and ranking being two typical stages. Retrieval methods sift through vast candidates to filter out irrelevant items, while ranking methods prioritize these candidates to present the most relevant items to users. Unlike studies focusing on the ranking stage, this survey explores the critical yet often overlooked retrieval stage of recommender systems. To achieve precise and efficient personalized retrieval, we summarize existing work in three key areas: improving similarity computation between user and item, enhancing indexing mechanisms for efficient retrieval, and optimizing training methods of retrieval. We also provide a comprehensive set of benchmarking experiments on three public datasets. Furthermore, we highlight current industrial applications through a case study on retrieval practices at a specific company, covering the entire retrieval process and online serving, along with practical implications and challenges. By detailing the retrieval stage, which is fundamental for effective recommendation, this survey aims to bridge the existing knowledge gap and serve as a cornerstone for researchers interested in optimizing this critical component of cascade recommender systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21022v1-abstract-full').style.display = 'none'; document.getElementById('2407.21022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">38 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20265">arXiv:2407.20265</a> <span> [<a href="https://arxiv.org/pdf/2407.20265">pdf</a>, <a href="https://arxiv.org/format/2407.20265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> COEFF-KANs: A Paradigm to Address the Electrolyte Field with KANs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinhe Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhuoying Feng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yezeng Chen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Weichen Dai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zixu He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yi Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+S">Shuhong Jiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20265v1-abstract-short" style="display: inline;"> To reduce the experimental validation workload for chemical researchers and accelerate the design and optimization of high-energy-density lithium metal batteries, we aim to leverage models to automatically predict Coulombic Efficiency (CE) based on the composition of liquid electrolytes. There are mainly two representative paradigms in existing methods: machine learning and deep learning. However,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20265v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20265v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20265v1-abstract-full" style="display: none;"> To reduce the experimental validation workload for chemical researchers and accelerate the design and optimization of high-energy-density lithium metal batteries, we aim to leverage models to automatically predict Coulombic Efficiency (CE) based on the composition of liquid electrolytes. There are mainly two representative paradigms in existing methods: machine learning and deep learning. However, the former requires intelligent input feature selection and reliable computational methods, leading to error propagation from feature estimation to model prediction, while the latter (e.g. MultiModal-MoLFormer) faces challenges of poor predictive performance and overfitting due to limited diversity in augmented data. To tackle these issues, we propose a novel method COEFF (COlumbic EFficiency prediction via Fine-tuned models), which consists of two stages: pre-training a chemical general model and fine-tuning on downstream domain data. Firstly, we adopt the publicly available MoLFormer model to obtain feature vectors for each solvent and salt in the electrolyte. Then, we perform a weighted average of embeddings for each token across all molecules, with weights determined by the respective electrolyte component ratios. Finally, we input the obtained electrolyte features into a Multi-layer Perceptron or Kolmogorov-Arnold Network to predict CE. Experimental results on a real-world dataset demonstrate that our method achieves SOTA for predicting CE compared to all baselines. Data and code used in this work will be made publicly available after the paper is published. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20265v1-abstract-full').style.display = 'none'; document.getElementById('2407.20265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17331">arXiv:2407.17331</a> <span> [<a href="https://arxiv.org/pdf/2407.17331">pdf</a>, <a href="https://arxiv.org/format/2407.17331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-73383-3_25">10.1007/978-3-031-73383-3_25 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-label Cluster Discrimination for Visual Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+X">Xiang An</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaicheng Yang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiangzi Dai</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziyong Feng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiankang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17331v2-abstract-short" style="display: inline;"> Contrastive Language Image Pre-training (CLIP) has recently demonstrated success across various tasks due to superior feature representation empowered by image-text contrastive learning. However, the instance discrimination method used by CLIP can hardly encode the semantic structure of training data. To handle this limitation, cluster discrimination has been proposed through iterative cluster ass… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17331v2-abstract-full').style.display = 'inline'; document.getElementById('2407.17331v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17331v2-abstract-full" style="display: none;"> Contrastive Language Image Pre-training (CLIP) has recently demonstrated success across various tasks due to superior feature representation empowered by image-text contrastive learning. However, the instance discrimination method used by CLIP can hardly encode the semantic structure of training data. To handle this limitation, cluster discrimination has been proposed through iterative cluster assignment and classification. Nevertheless, most cluster discrimination approaches only define a single pseudo-label for each image, neglecting multi-label signals in the image. In this paper, we propose a novel Multi-Label Cluster Discrimination method named MLCD to enhance representation learning. In the clustering step, we first cluster the large-scale LAION-400M dataset into one million centers based on off-the-shelf embedding features. Considering that natural images frequently contain multiple visual objects or attributes, we select the multiple closest centers as auxiliary class labels. In the discrimination step, we design a novel multi-label classification loss, which elegantly separates losses from positive classes and negative classes, and alleviates ambiguity on decision boundary. We validate the proposed multi-label cluster discrimination method with experiments on different scales of models and pre-training datasets. Experimental results show that our method achieves state-of-the-art performance on multiple downstream tasks including linear probe, zero-shot classification, and image-text retrieval. Code and models have been released at https://github.com/deepglint/unicom . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17331v2-abstract-full').style.display = 'none'; document.getElementById('2407.17331v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository