CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 788 results for author: <span class="mathjax">Wu, F</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wu%2C+F">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wu, F"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wu%2C+F&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wu, F"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wu%2C+F&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13787">arXiv:2411.13787</a> <span> [<a href="https://arxiv.org/pdf/2411.13787">pdf</a>, <a href="https://arxiv.org/format/2411.13787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Edge-Cloud Routing for Text-to-Image Model with Token-Level Multi-Metric Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+Z">Zewei Xin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinya Li</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+C">Chaoyue Niu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13787v1-abstract-short" style="display: inline;"> Large text-to-image models demonstrate impressive generation capabilities; however, their substantial size necessitates expensive cloud servers for deployment. Conversely, light-weight models can be deployed on edge devices at lower cost but often with inferior generation quality for complex user prompts. To strike a balance between performance and cost, we propose a routing framework, called \tex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13787v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13787v1-abstract-full" style="display: none;"> Large text-to-image models demonstrate impressive generation capabilities; however, their substantial size necessitates expensive cloud servers for deployment. Conversely, light-weight models can be deployed on edge devices at lower cost but often with inferior generation quality for complex user prompts. To strike a balance between performance and cost, we propose a routing framework, called \texttt{RouteT2I}, which dynamically selects either the large cloud model or the light-weight edge model for each user prompt. Since generated image quality is challenging to measure directly, \texttt{RouteT2I} establishes multi-dimensional quality metrics, particularly, by evaluating the similarity between the generated images and both positive and negative texts that describe each specific quality metric. \texttt{RouteT2I} then predicts the expected quality of the generated images by identifying key tokens in the prompt and comparing their impact on the quality. \texttt{RouteT2I} further introduces the Pareto relative superiority to compare the multi-metric quality of the generated images. Based on this comparison and predefined cost constraints, \texttt{RouteT2I} allocates prompts to either the edge or the cloud. Evaluation reveals that \texttt{RouteT2I} significantly reduces the number of requesting large cloud model while maintaining high-quality image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13787v1-abstract-full').style.display = 'none'; document.getElementById('2411.13787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13740">arXiv:2411.13740</a> <span> [<a href="https://arxiv.org/pdf/2411.13740">pdf</a>, <a href="https://arxiv.org/format/2411.13740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Federated Continual Learning for Edge-AI: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zi Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yurui Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jia Hu</a>, <a href="/search/cs?searchtype=author&query=Min%2C+G">Geyong Min</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13740v1-abstract-short" style="display: inline;"> Edge-AI, the convergence of edge computing and artificial intelligence (AI), has become a promising paradigm that enables the deployment of advanced AI models at the network edge, close to users. In Edge-AI, federated continual learning (FCL) has emerged as an imperative framework, which fuses knowledge from different clients while preserving data privacy and retaining knowledge from previous task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13740v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13740v1-abstract-full" style="display: none;"> Edge-AI, the convergence of edge computing and artificial intelligence (AI), has become a promising paradigm that enables the deployment of advanced AI models at the network edge, close to users. In Edge-AI, federated continual learning (FCL) has emerged as an imperative framework, which fuses knowledge from different clients while preserving data privacy and retaining knowledge from previous tasks as it learns new ones. By so doing, FCL aims to ensure stable and reliable performance of learning models in dynamic and distributed environments. In this survey, we thoroughly review the state-of-the-art research and present the first comprehensive survey of FCL for Edge-AI. We categorize FCL methods based on three task characteristics: federated class continual learning, federated domain continual learning, and federated task continual learning. For each category, an in-depth investigation and review of the representative methods are provided, covering background, challenges, problem formalisation, solutions, and limitations. Besides, existing real-world applications empowered by FCL are reviewed, indicating the current progress and potential of FCL in diverse application domains. Furthermore, we discuss and highlight several prospective research directions of FCL such as algorithm-hardware co-design for FCL and FCL with foundation models, which could provide insights into the future development and practical deployment of FCL in the era of Edge-AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13740v1-abstract-full').style.display = 'none'; document.getElementById('2411.13740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13602">arXiv:2411.13602</a> <span> [<a href="https://arxiv.org/pdf/2411.13602">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zhengyao Ding</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yujian Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Youyao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziyu Li</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yiheng Mao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haitao Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jing Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yue Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mengjia Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longbo Wang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xuesen Chu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+W">Weichao Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyi Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongkun Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Ting Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhengxing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13602v1-abstract-short" style="display: inline;"> Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13602v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13602v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13602v1-abstract-full" style="display: none;"> Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and the MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals), and the findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13602v1-abstract-full').style.display = 'none'; document.getElementById('2411.13602v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12629">arXiv:2411.12629</a> <span> [<a href="https://arxiv.org/pdf/2411.12629">pdf</a>, <a href="https://arxiv.org/format/2411.12629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cosmology and Nongalactic Astrophysics">astro-ph.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Estimating Dark Matter Halo Masses in Simulated Galaxy Clusters with Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Garuda%2C+N">Nikhil Garuda</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J+F">John F. Wu</a>, <a href="/search/cs?searchtype=author&query=Nelson%2C+D">Dylan Nelson</a>, <a href="/search/cs?searchtype=author&query=Pillepich%2C+A">Annalisa Pillepich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12629v1-abstract-short" style="display: inline;"> Galaxies grow and evolve in dark matter halos. Because dark matter is not visible, galaxies' halo masses ($\rm{M}_{\rm{halo}}$) must be inferred indirectly. We present a graph neural network (GNN) model for predicting $\rm{M}_{\rm{halo}}$ from stellar mass ($\rm{M}_{*}$) in simulated galaxy clusters using data from the IllustrisTNG simulation suite. Unlike traditional machine learning models like… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12629v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12629v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12629v1-abstract-full" style="display: none;"> Galaxies grow and evolve in dark matter halos. Because dark matter is not visible, galaxies' halo masses ($\rm{M}_{\rm{halo}}$) must be inferred indirectly. We present a graph neural network (GNN) model for predicting $\rm{M}_{\rm{halo}}$ from stellar mass ($\rm{M}_{*}$) in simulated galaxy clusters using data from the IllustrisTNG simulation suite. Unlike traditional machine learning models like random forests, our GNN captures the information-rich substructure of galaxy clusters by using spatial and kinematic relationships between galaxy neighbour. A GNN model trained on the TNG-Cluster dataset and independently tested on the TNG300 simulation achieves superior predictive performance compared to other baseline models we tested. Future work will extend this approach to different simulations and real observational datasets to further validate the GNN model's ability to generalise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12629v1-abstract-full').style.display = 'none'; document.getElementById('2411.12629v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures, accepted at the NeurIPS ML4PS 2024 workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12089">arXiv:2411.12089</a> <span> [<a href="https://arxiv.org/pdf/2411.12089">pdf</a>, <a href="https://arxiv.org/format/2411.12089">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> FruitNinja: 3D Object Interior Texture Generation with Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fangyu Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuhao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12089v2-abstract-short" style="display: inline;"> In the real world, objects reveal internal textures when sliced or cut, yet this behavior is not well-studied in 3D generation tasks today. For example, slicing a virtual 3D watermelon should reveal flesh and seeds. Given that no available dataset captures an object's full internal structure and collecting data from all slices is impractical, generative methods become the obvious approach. However… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12089v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12089v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12089v2-abstract-full" style="display: none;"> In the real world, objects reveal internal textures when sliced or cut, yet this behavior is not well-studied in 3D generation tasks today. For example, slicing a virtual 3D watermelon should reveal flesh and seeds. Given that no available dataset captures an object's full internal structure and collecting data from all slices is impractical, generative methods become the obvious approach. However, current 3D generation and inpainting methods often focus on visible appearance and overlook internal textures. To bridge this gap, we introduce FruitNinja, the first method to generate internal textures for 3D objects undergoing geometric and topological changes. Our approach produces objects via 3D Gaussian Splatting (3DGS) with both surface and interior textures synthesized, enabling real-time slicing and rendering without additional optimization. FruitNinja leverages a pre-trained diffusion model to progressively inpaint cross-sectional views and applies voxel-grid-based smoothing to achieve cohesive textures throughout the object. Our OpaqueAtom GS strategy overcomes 3DGS limitations by employing densely distributed opaque Gaussians, avoiding biases toward larger particles that destabilize training and sharp color transitions for fine-grained textures. Experimental results show that FruitNinja substantially outperforms existing approaches, showcasing unmatched visual quality in real-time rendered internal views across arbitrary geometry manipulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12089v2-abstract-full').style.display = 'none'; document.getElementById('2411.12089v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10687">arXiv:2411.10687</a> <span> [<a href="https://arxiv.org/pdf/2411.10687">pdf</a>, <a href="https://arxiv.org/format/2411.10687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EDBooks: AI-Enhanced Interactive Narratives for Programming Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oney%2C+S">Steve Oney</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yue Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y+S">Young Suh Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziang Wang</a>, <a href="/search/cs?searchtype=author&query=Khajekar%2C+Y">Yamini Khajekar</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiacheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A+Y">April Yi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10687v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown the potential to be valuable teaching tools, with the potential of giving every student a personalized tutor. However, one challenge with using LLMs to learn new concepts is that when learning a topic in an unfamiliar domain, it can be difficult to know what questions to ask. Further, language models do not always encourage "active learning" where students c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10687v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10687v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown the potential to be valuable teaching tools, with the potential of giving every student a personalized tutor. However, one challenge with using LLMs to learn new concepts is that when learning a topic in an unfamiliar domain, it can be difficult to know what questions to ask. Further, language models do not always encourage "active learning" where students can test and assess their understanding. In this paper, we propose ways to combine large language models with "traditional" learning materials (like e-books) to give readers the benefits of working with LLMs (the ability to ask personally interesting questions and receive personalized answers) with the benefits of a traditional e-book (having a structure and content that is pedagogically sound). This work shows one way that LLMs have the potential to improve learning materials and make personalized programming education more accessible to a broader audience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10687v1-abstract-full').style.display = 'none'; document.getElementById('2411.10687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10618">arXiv:2411.10618</a> <span> [<a href="https://arxiv.org/pdf/2411.10618">pdf</a>, <a href="https://arxiv.org/format/2411.10618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> D-Flow: Multi-modality Flow Matching for D-peptide Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fang Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tinson Xu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shuting Jin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiangru Tang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zerui Xu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+J">James Zou</a>, <a href="/search/cs?searchtype=author&query=Hie%2C+B">Brian Hie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10618v1-abstract-short" style="display: inline;"> Proteins play crucial roles in biological processes, with therapeutic peptides emerging as promising pharmaceutical agents. They allow new possibilities to leverage target binding sites that were previously undruggable. While deep learning (DL) has advanced peptide discovery, generating D-proteins composed of D-amino acids remains challenging due to the scarcity of natural examples. This paper pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10618v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10618v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10618v1-abstract-full" style="display: none;"> Proteins play crucial roles in biological processes, with therapeutic peptides emerging as promising pharmaceutical agents. They allow new possibilities to leverage target binding sites that were previously undruggable. While deep learning (DL) has advanced peptide discovery, generating D-proteins composed of D-amino acids remains challenging due to the scarcity of natural examples. This paper proposes D-Flow, a full-atom flow-based framework for {de novo} D-peptide design. D-Flow is conditioned on receptor binding and utilizes a comprehensive representation of peptide structure, incorporating backbone frames, side-chain angles, and discrete amino acid types. A mirror-image algorithm is implemented to address the lack of training data for D-proteins, which converts L-receptors' chirality. Furthermore, we enhance D-Flow's capacity by integrating large protein language models (PLMs) with structural awareness through a lightweight structural adapter. A two-stage training pipeline and a controlling toolkit also enable D-Flow to transition from general protein design to targeted binder design while preserving pretraining knowledge. Extensive experimental results on the PepMerge benchmark demonstrate D-Flow's effectiveness, particularly in developing peptides with entire D-residues. This approach represents a significant advancement in computational D-peptide design, offering unique opportunities for bioorthogonal and stable molecular tools and diagnostics. The code is available in~\url{https://github.com/smiles724/PeptideDesign}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10618v1-abstract-full').style.display = 'none'; document.getElementById('2411.10618v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06493">arXiv:2411.06493</a> <span> [<a href="https://arxiv.org/pdf/2411.06493">pdf</a>, <a href="https://arxiv.org/format/2411.06493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LProtector: An LLM-driven Vulnerability Detection System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sheng%2C+Z">Ze Sheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fenghua Wu</a>, <a href="/search/cs?searchtype=author&query=Zuo%2C+X">Xiangwu Zuo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yuxin Qiao</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+L">Lei Hang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06493v2-abstract-short" style="display: inline;"> This paper presents LProtector, an automated vulnerability detection system for C/C++ codebases driven by the large language model (LLM) GPT-4o and Retrieval-Augmented Generation (RAG). As software complexity grows, traditional methods face challenges in detecting vulnerabilities effectively. LProtector leverages GPT-4o's powerful code comprehension and generation capabilities to perform binary cl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06493v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06493v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06493v2-abstract-full" style="display: none;"> This paper presents LProtector, an automated vulnerability detection system for C/C++ codebases driven by the large language model (LLM) GPT-4o and Retrieval-Augmented Generation (RAG). As software complexity grows, traditional methods face challenges in detecting vulnerabilities effectively. LProtector leverages GPT-4o's powerful code comprehension and generation capabilities to perform binary classification and identify vulnerabilities within target codebases. We conducted experiments on the Big-Vul dataset, showing that LProtector outperforms two state-of-the-art baselines in terms of F1 score, demonstrating the potential of integrating LLMs with vulnerability detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06493v2-abstract-full').style.display = 'none'; document.getElementById('2411.06493v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 4 figures. This is a preprint version of the article. The final version will be published in the proceedings of the IEEE conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04399">arXiv:2411.04399</a> <span> [<a href="https://arxiv.org/pdf/2411.04399">pdf</a>, <a href="https://arxiv.org/format/2411.04399">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProGraph: Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zehui Feng</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Genfan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Feng Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04399v1-abstract-short" style="display: inline;"> Current 3D human motion reconstruction methods from monocular videos rely on features within the current reconstruction window, leading to distortion and deformations in the human structure under local occlusions or blurriness in video frames. To estimate realistic 3D human mesh sequences based on incomplete features, we propose Temporally-alignable Probability Guided Graph Topological Modeling fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04399v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04399v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04399v1-abstract-full" style="display: none;"> Current 3D human motion reconstruction methods from monocular videos rely on features within the current reconstruction window, leading to distortion and deformations in the human structure under local occlusions or blurriness in video frames. To estimate realistic 3D human mesh sequences based on incomplete features, we propose Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction (ProGraph). For missing parts recovery, we exploit the explicit topological-aware probability distribution across the entire motion sequence. To restore the complete human, Graph Topological Modeling (GTM) learns the underlying topological structure, focusing on the relationships inherent in the individual parts. Next, to generate blurred motion parts, Temporal-alignable Probability Distribution (TPDist) utilizes the GTM to predict features based on distribution. This interactive mechanism facilitates motion consistency, allowing the restoration of human parts. Furthermore, Hierarchical Human Loss (HHLoss) constrains the probability distribution errors of inter-frame features during topological structure variation. Our Method achieves superior results than other SOTA methods in addressing occlusions and blurriness on 3DPW. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04399v1-abstract-full').style.display = 'none'; document.getElementById('2411.04399v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03957">arXiv:2411.03957</a> <span> [<a href="https://arxiv.org/pdf/2411.03957">pdf</a>, <a href="https://arxiv.org/format/2411.03957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fine-Grained Guidance for Retrievers: Leveraging LLMs' Feedback in Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhang Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xueyu Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03957v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) has proven to be an effective method for mitigating hallucination issues inherent in large language models (LLMs). Previous approaches typically train retrievers based on semantic similarity, lacking optimization for RAG. More recent works have proposed aligning retrievers with the preference signals of LLMs. However, these preference signals are often difficul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03957v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03957v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) has proven to be an effective method for mitigating hallucination issues inherent in large language models (LLMs). Previous approaches typically train retrievers based on semantic similarity, lacking optimization for RAG. More recent works have proposed aligning retrievers with the preference signals of LLMs. However, these preference signals are often difficult for dense retrievers, which typically have weaker language capabilities, to understand and learn effectively. Drawing inspiration from pedagogical theories like Guided Discovery Learning, we propose a novel framework, FiGRet (Fine-grained Guidance for Retrievers), which leverages the language capabilities of LLMs to construct examples from a more granular, information-centric perspective to guide the learning of retrievers. Specifically, our method utilizes LLMs to construct easy-to-understand examples from samples where the retriever performs poorly, focusing on three learning objectives highly relevant to the RAG scenario: relevance, comprehensiveness, and purity. These examples serve as scaffolding to ultimately align the retriever with the LLM's preferences. Furthermore, we employ a dual curriculum learning strategy and leverage the reciprocal feedback between LLM and retriever to further enhance the performance of the RAG system. A series of experiments demonstrate that our proposed framework enhances the performance of RAG systems equipped with different retrievers and is applicable to various LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03957v1-abstract-full').style.display = 'none'; document.getElementById('2411.03957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02086">arXiv:2411.02086</a> <span> [<a href="https://arxiv.org/pdf/2411.02086">pdf</a>, <a href="https://arxiv.org/format/2411.02086">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Real-time and Downtime-tolerant Fault Diagnosis for Railway Turnout Machines (RTMs) Empowered with Cloud-Edge Pipeline Parallelism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Bilal%2C+M">Muhammad Bilal</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+H">Haolong Xiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Heng Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jinjun Yu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02086v1-abstract-short" style="display: inline;"> Railway Turnout Machines (RTMs) are mission-critical components of the railway transportation infrastructure, responsible for directing trains onto desired tracks. For safety assurance applications, especially in early-warning scenarios, RTM faults are expected to be detected as early as possible on a continuous 7x24 basis. However, limited emphasis has been placed on distributed model inference f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02086v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02086v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02086v1-abstract-full" style="display: none;"> Railway Turnout Machines (RTMs) are mission-critical components of the railway transportation infrastructure, responsible for directing trains onto desired tracks. For safety assurance applications, especially in early-warning scenarios, RTM faults are expected to be detected as early as possible on a continuous 7x24 basis. However, limited emphasis has been placed on distributed model inference frameworks that can meet the inference latency and reliability requirements of such mission critical fault diagnosis systems. In this paper, an edge-cloud collaborative early-warning system is proposed to enable real-time and downtime-tolerant fault diagnosis of RTMs, providing a new paradigm for the deployment of models in safety-critical scenarios. Firstly, a modular fault diagnosis model is designed specifically for distributed deployment, which utilizes a hierarchical architecture consisting of the prior knowledge module, subordinate classifiers, and a fusion layer for enhanced accuracy and parallelism. Then, a cloud-edge collaborative framework leveraging pipeline parallelism, namely CEC-PA, is developed to minimize the overhead resulting from distributed task execution and context exchange by strategically partitioning and offloading model components across cloud and edge. Additionally, an election consensus mechanism is implemented within CEC-PA to ensure system robustness during coordinator node downtime. Comparative experiments and ablation studies are conducted to validate the effectiveness of the proposed distributed fault diagnosis approach. Our ensemble-based fault diagnosis model achieves a remarkable 97.4% accuracy on a real-world dataset collected by Nanjing Metro in Jiangsu Province, China. Meanwhile, CEC-PA demonstrates superior recovery proficiency during node disruptions and speed-up ranging from 1.98x to 7.93x in total inference time compared to its counterparts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02086v1-abstract-full').style.display = 'none'; document.getElementById('2411.02086v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00696">arXiv:2411.00696</a> <span> [<a href="https://arxiv.org/pdf/2411.00696">pdf</a>, <a href="https://arxiv.org/format/2411.00696">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CTPD: Cross-Modal Temporal Pattern Discovery for Enhanced Multimodal Electronic Health Records Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuying Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yihan Tang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lequan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00696v1-abstract-short" style="display: inline;"> Integrating multimodal Electronic Health Records (EHR) data, such as numerical time series and free-text clinical reports, has great potential in predicting clinical outcomes. However, prior work has primarily focused on capturing temporal interactions within individual samples and fusing multimodal information, overlooking critical temporal patterns across patients. These patterns, such as trends… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00696v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00696v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00696v1-abstract-full" style="display: none;"> Integrating multimodal Electronic Health Records (EHR) data, such as numerical time series and free-text clinical reports, has great potential in predicting clinical outcomes. However, prior work has primarily focused on capturing temporal interactions within individual samples and fusing multimodal information, overlooking critical temporal patterns across patients. These patterns, such as trends in vital signs like abnormal heart rate or blood pressure, can indicate deteriorating health or an impending critical event. Similarly, clinical notes often contain textual descriptions that reflect these patterns. Identifying corresponding temporal patterns across different modalities is crucial for improving the accuracy of clinical outcome predictions, yet it remains a challenging task. To address this gap, we introduce a Cross-Modal Temporal Pattern Discovery (CTPD) framework, designed to efficiently extract meaningful cross-modal temporal patterns from multimodal EHR data. Our approach introduces shared initial temporal pattern representations which are refined using slot attention to generate temporal semantic embeddings. To ensure rich cross-modal temporal semantics in the learned patterns, we introduce a contrastive-based TPNCE loss for cross-modal alignment, along with two reconstruction losses to retain core information of each modality. Evaluations on two clinically critical tasks, 48-hour in-hospital mortality and 24-hour phenotype classification, using the MIMIC-III database demonstrate the superiority of our method over existing approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00696v1-abstract-full').style.display = 'none'; document.getElementById('2411.00696v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24028">arXiv:2410.24028</a> <span> [<a href="https://arxiv.org/pdf/2410.24028">pdf</a>, <a href="https://arxiv.org/format/2410.24028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> AdaFlow: Opportunistic Inference on Asynchronous Mobile Data with Generalized Affinity Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fenmin Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sicong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kehao Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaochen Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+B">Bin Guo</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhiwen Yu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+H">Hongkai Wen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiangrui Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lehao Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiangyu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24028v1-abstract-short" style="display: inline;"> The rise of mobile devices equipped with numerous sensors, such as LiDAR and cameras, has spurred the adoption of multi-modal deep intelligence for distributed sensing tasks, such as smart cabins and driving assistance. However, the arrival times of mobile sensory data vary due to modality size and network dynamics, which can lead to delays (if waiting for slower data) or accuracy decline (if infe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24028v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24028v1-abstract-full" style="display: none;"> The rise of mobile devices equipped with numerous sensors, such as LiDAR and cameras, has spurred the adoption of multi-modal deep intelligence for distributed sensing tasks, such as smart cabins and driving assistance. However, the arrival times of mobile sensory data vary due to modality size and network dynamics, which can lead to delays (if waiting for slower data) or accuracy decline (if inference proceeds without waiting). Moreover, the diversity and dynamic nature of mobile systems exacerbate this challenge. In response, we present a shift to \textit{opportunistic} inference for asynchronous distributed multi-modal data, enabling inference as soon as partial data arrives. While existing methods focus on optimizing modality consistency and complementarity, known as modal affinity, they lack a \textit{computational} approach to control this affinity in open-world mobile environments. AdaFlow pioneers the formulation of structured cross-modality affinity in mobile contexts using a hierarchical analysis-based normalized matrix. This approach accommodates the diversity and dynamics of modalities, generalizing across different types and numbers of inputs. Employing an affinity attention-based conditional GAN (ACGAN), AdaFlow facilitates flexible data imputation, adapting to various modalities and downstream tasks without retraining. Experiments show that AdaFlow significantly reduces inference latency by up to 79.9\% and enhances accuracy by up to 61.9\%, outperforming status quo approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24028v1-abstract-full').style.display = 'none'; document.getElementById('2410.24028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22806">arXiv:2410.22806</a> <span> [<a href="https://arxiv.org/pdf/2410.22806">pdf</a>, <a href="https://arxiv.org/format/2410.22806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> MILP-StuDio: MILP Instance Generation via Block Structure Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wanbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Z">Zijie Geng</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+Y">Yufei Kuang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xijun Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongdong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22806v2-abstract-short" style="display: inline;"> Mixed-integer linear programming (MILP) is one of the most popular mathematical formulations with numerous applications. In practice, improving the performance of MILP solvers often requires a large amount of high-quality data, which can be challenging to collect. Researchers thus turn to generation techniques to generate additional MILP instances. However, existing approaches do not take into acc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22806v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22806v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22806v2-abstract-full" style="display: none;"> Mixed-integer linear programming (MILP) is one of the most popular mathematical formulations with numerous applications. In practice, improving the performance of MILP solvers often requires a large amount of high-quality data, which can be challenging to collect. Researchers thus turn to generation techniques to generate additional MILP instances. However, existing approaches do not take into account specific block structures -- which are closely related to the problem formulations -- in the constraint coefficient matrices (CCMs) of MILPs. Consequently, they are prone to generate computationally trivial or infeasible instances due to the disruptions of block structures and thus problem formulations. To address this challenge, we propose a novel MILP generation framework, called Block Structure Decomposition (MILP-StuDio), to generate high-quality instances by preserving the block structures. Specifically, MILP-StuDio begins by identifying the blocks in CCMs and decomposing the instances into block units, which serve as the building blocks of MILP instances. We then design three operators to construct new instances by removing, substituting, and appending block units in the original instances, enabling us to generate instances with flexible sizes. An appealing feature of MILP-StuDio is its strong ability to preserve the feasibility and computational hardness of the generated instances. Experiments on the commonly-used benchmarks demonstrate that using instances generated by MILP-StuDio is able to significantly reduce over 10% of the solving time for learning-based solvers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22806v2-abstract-full').style.display = 'none'; document.getElementById('2410.22806v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21492">arXiv:2410.21492</a> <span> [<a href="https://arxiv.org/pdf/2410.21492">pdf</a>, <a href="https://arxiv.org/format/2410.21492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FATH: Authentication-based Test-time Defense against Indirect Prompt Injection Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiongxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fangzhou Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wendi Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jinsheng Pan</a>, <a href="/search/cs?searchtype=author&query=Suh%2C+E">Edward Suh</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Z+M">Z. Morley Mao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Muhao Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaowei Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21492v1-abstract-short" style="display: inline;"> Large language models (LLMs) have been widely deployed as the backbone with additional tools and text information for real-world applications. However, integrating external information into LLM-integrated applications raises significant security concerns. Among these, prompt injection attacks are particularly threatening, where malicious instructions injected in the external text information can e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21492v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21492v1-abstract-full" style="display: none;"> Large language models (LLMs) have been widely deployed as the backbone with additional tools and text information for real-world applications. However, integrating external information into LLM-integrated applications raises significant security concerns. Among these, prompt injection attacks are particularly threatening, where malicious instructions injected in the external text information can exploit LLMs to generate answers as the attackers desire. While both training-time and test-time defense methods have been developed to mitigate such attacks, the unaffordable training costs associated with training-time methods and the limited effectiveness of existing test-time methods make them impractical. This paper introduces a novel test-time defense strategy, named Formatting AuThentication with Hash-based tags (FATH). Unlike existing approaches that prevent LLMs from answering additional instructions in external text, our method implements an authentication system, requiring LLMs to answer all received instructions with a security policy and selectively filter out responses to user instructions as the final output. To achieve this, we utilize hash-based authentication tags to label each response, facilitating accurate identification of responses according to the user's instructions and improving the robustness against adaptive attacks. Comprehensive experiments demonstrate that our defense method can effectively defend against indirect prompt injection attacks, achieving state-of-the-art performance under Llama3 and GPT3.5 models across various attack methods. Our code is released at: https://github.com/Jayfeather1024/FATH <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21492v1-abstract-full').style.display = 'none'; document.getElementById('2410.21492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21337">arXiv:2410.21337</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fine-tuned Large Language Models (LLMs): Improved Prompt Injection Attacks Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rahman%2C+M+A">Md Abdur Rahman</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Cuzzocrea%2C+A">Alfredo Cuzzocrea</a>, <a href="/search/cs?searchtype=author&query=Ahamed%2C+S+I">Sheikh Iqbal Ahamed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21337v2-abstract-short" style="display: inline;"> Large language models (LLMs) are becoming a popular tool as they have significantly advanced in their capability to tackle a wide range of language-based tasks. However, LLMs applications are highly vulnerable to prompt injection attacks, which poses a critical problem. These attacks target LLMs applications through using carefully designed input prompts to divert the model from adhering to origin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21337v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21337v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21337v2-abstract-full" style="display: none;"> Large language models (LLMs) are becoming a popular tool as they have significantly advanced in their capability to tackle a wide range of language-based tasks. However, LLMs applications are highly vulnerable to prompt injection attacks, which poses a critical problem. These attacks target LLMs applications through using carefully designed input prompts to divert the model from adhering to original instruction, thereby it could execute unintended actions. These manipulations pose serious security threats which potentially results in data leaks, biased outputs, or harmful responses. This project explores the security vulnerabilities in relation to prompt injection attacks. To detect whether a prompt is vulnerable or not, we follows two approaches: 1) a pre-trained LLM, and 2) a fine-tuned LLM. Then, we conduct a thorough analysis and comparison of the classification performance. Firstly, we use pre-trained XLM-RoBERTa model to detect prompt injections using test dataset without any fine-tuning and evaluate it by zero-shot classification. Then, this proposed work will apply supervised fine-tuning to this pre-trained LLM using a task-specific labeled dataset from deepset in huggingface, and this fine-tuned model achieves impressive results with 99.13\% accuracy, 100\% precision, 98.33\% recall and 99.15\% F1-score thorough rigorous experimentation and evaluation. We observe that our approach is highly efficient in detecting prompt injection attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21337v2-abstract-full').style.display = 'none'; document.getElementById('2410.21337v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">I am requesting the withdrawal of my paper due to critical issues identified in the methodology/results that may impact its accuracy and reliability. I also plan to make substantial revisions that go beyond minor corrections</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20664">arXiv:2410.20664</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Embedding with Large Language Models for Classification of HIPAA Safeguard Compliance Rules </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rahman%2C+M+A">Md Abdur Rahman</a>, <a href="/search/cs?searchtype=author&query=Barek%2C+M+A">Md Abdul Barek</a>, <a href="/search/cs?searchtype=author&query=Riad%2C+A+K+I">ABM Kamrul Islam Riad</a>, <a href="/search/cs?searchtype=author&query=Rahman%2C+M+M">Md Mostafizur Rahman</a>, <a href="/search/cs?searchtype=author&query=Rashid%2C+M+B">Md Bajlur Rashid</a>, <a href="/search/cs?searchtype=author&query=Ambedkar%2C+S">Smita Ambedkar</a>, <a href="/search/cs?searchtype=author&query=Miaa%2C+M+R">Md Raihan Miaa</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Cuzzocrea%2C+A">Alfredo Cuzzocrea</a>, <a href="/search/cs?searchtype=author&query=Ahamed%2C+S+I">Sheikh Iqbal Ahamed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20664v2-abstract-short" style="display: inline;"> Although software developers of mHealth apps are responsible for protecting patient data and adhering to strict privacy and security requirements, many of them lack awareness of HIPAA regulations and struggle to distinguish between HIPAA rules categories. Therefore, providing guidance of HIPAA rules patterns classification is essential for developing secured applications for Google Play Store. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20664v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20664v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20664v2-abstract-full" style="display: none;"> Although software developers of mHealth apps are responsible for protecting patient data and adhering to strict privacy and security requirements, many of them lack awareness of HIPAA regulations and struggle to distinguish between HIPAA rules categories. Therefore, providing guidance of HIPAA rules patterns classification is essential for developing secured applications for Google Play Store. In this work, we identified the limitations of traditional Word2Vec embeddings in processing code patterns. To address this, we adopt multilingual BERT (Bidirectional Encoder Representations from Transformers) which offers contextualized embeddings to the attributes of dataset to overcome the issues. Therefore, we applied this BERT to our dataset for embedding code patterns and then uses these embedded code to various machine learning approaches. Our results demonstrate that the models significantly enhances classification performance, with Logistic Regression achieving a remarkable accuracy of 99.95\%. Additionally, we obtained high accuracy from Support Vector Machine (99.79\%), Random Forest (99.73\%), and Naive Bayes (95.93\%), outperforming existing approaches. This work underscores the effectiveness and showcases its potential for secure application development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20664v2-abstract-full').style.display = 'none'; document.getElementById('2410.20664v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">I am requesting the withdrawal of my paper due to critical issues identified in the methodology/results that may impact its accuracy and reliability. I also plan to make substantial revisions that go beyond minor corrections</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19241">arXiv:2410.19241</a> <span> [<a href="https://arxiv.org/pdf/2410.19241">pdf</a>, <a href="https://arxiv.org/format/2410.19241">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Exchange Rate Forecasting with Explainable Deep Learning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+S">Shuchen Meng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Andi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chihang Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+M">Mengyao Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fangyu Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xupeng Chen</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+H">Haowei Ni</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Panfeng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19241v2-abstract-short" style="display: inline;"> Accurate exchange rate prediction is fundamental to financial stability and international trade, positioning it as a critical focus in economic and financial research. Traditional forecasting models often falter when addressing the inherent complexities and non-linearities of exchange rate data. This study explores the application of advanced deep learning models, including LSTM, CNN, and transfor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19241v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19241v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19241v2-abstract-full" style="display: none;"> Accurate exchange rate prediction is fundamental to financial stability and international trade, positioning it as a critical focus in economic and financial research. Traditional forecasting models often falter when addressing the inherent complexities and non-linearities of exchange rate data. This study explores the application of advanced deep learning models, including LSTM, CNN, and transformer-based architectures, to enhance the predictive accuracy of the RMB/USD exchange rate. Utilizing 40 features across 6 categories, the analysis identifies TSMixer as the most effective model for this task. A rigorous feature selection process emphasizes the inclusion of key economic indicators, such as China-U.S. trade volumes and exchange rates of other major currencies like the euro-RMB and yen-dollar pairs. The integration of grad-CAM visualization techniques further enhances model interpretability, allowing for clearer identification of the most influential features and bolstering the credibility of the predictions. These findings underscore the pivotal role of fundamental economic data in exchange rate forecasting and highlight the substantial potential of machine learning models to deliver more accurate and reliable predictions, thereby serving as a valuable tool for financial analysis and decision-making. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19241v2-abstract-full').style.display = 'none'; document.getElementById('2410.19241v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2024 5th International Conference on Machine Learning and Computer Application</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18378">arXiv:2410.18378</a> <span> [<a href="https://arxiv.org/pdf/2410.18378">pdf</a>, <a href="https://arxiv.org/format/2410.18378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Delta: A Cloud-assisted Data Enrichment Framework for On-Device Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chen Gong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhenzhe Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xiaofeng Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guihai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18378v1-abstract-short" style="display: inline;"> In modern mobile applications, users frequently encounter various new contexts, necessitating on-device continual learning (CL) to ensure consistent model performance. While existing research predominantly focused on developing lightweight CL frameworks, we identify that data scarcity is a critical bottleneck for on-device CL. In this work, we explore the potential of leveraging abundant cloud-sid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18378v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18378v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18378v1-abstract-full" style="display: none;"> In modern mobile applications, users frequently encounter various new contexts, necessitating on-device continual learning (CL) to ensure consistent model performance. While existing research predominantly focused on developing lightweight CL frameworks, we identify that data scarcity is a critical bottleneck for on-device CL. In this work, we explore the potential of leveraging abundant cloud-side data to enrich scarce on-device data, and propose a private, efficient and effective data enrichment framework Delta. Specifically, Delta first introduces a directory dataset to decompose the data enrichment problem into device-side and cloud-side sub-problems without sharing sensitive data. Next, Delta proposes a soft data matching strategy to effectively solve the device-side sub-problem with sparse user data, and an optimal data sampling scheme for cloud server to retrieve the most suitable dataset for enrichment with low computational complexity. Further, Delta refines the data sampling scheme by jointly considering the impact of enriched data on both new and past contexts, mitigating the catastrophic forgetting issue from a new aspect. Comprehensive experiments across four typical mobile computing tasks with varied data modalities demonstrate that Delta could enhance the overall model accuracy by an average of 15.1%, 12.4%, 1.1% and 5.6% for visual, IMU, audio and textual tasks compared with few-shot CL, and consistently reduce the communication costs by over 90% compared to federated CL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18378v1-abstract-full').style.display = 'none'; document.getElementById('2410.18378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17247">arXiv:2410.17247</a> <span> [<a href="https://arxiv.org/pdf/2410.17247">pdf</a>, <a href="https://arxiv.org/format/2410.17247">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PyramidDrop: Accelerating Your Large Vision-Language Models via Pyramid Visual Redundancy Reduction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+L">Long Xing</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qidong Huang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaoyi Dong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiajie Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yuhang Zang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuhang Cao</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17247v1-abstract-short" style="display: inline;"> In large vision-language models (LVLMs), images serve as inputs that carry a wealth of information. As the idiom "A picture is worth a thousand words" implies, representing a single image in current LVLMs can require hundreds or even thousands of tokens. This results in significant computational costs, which grow quadratically as input image resolution increases, thereby severely impacting the eff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17247v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17247v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17247v1-abstract-full" style="display: none;"> In large vision-language models (LVLMs), images serve as inputs that carry a wealth of information. As the idiom "A picture is worth a thousand words" implies, representing a single image in current LVLMs can require hundreds or even thousands of tokens. This results in significant computational costs, which grow quadratically as input image resolution increases, thereby severely impacting the efficiency of both training and inference. Previous approaches have attempted to reduce the number of image tokens either before or within the early layers of LVLMs. However, these strategies inevitably result in the loss of crucial image information, ultimately diminishing model performance. To address this challenge, we conduct an empirical study revealing that all visual tokens are necessary for LVLMs in the shallow layers, and token redundancy progressively increases in the deeper layers of the model. To this end, we propose PyramidDrop, a visual redundancy reduction strategy for LVLMs to boost their efficiency in both training and inference with neglectable performance loss. Specifically, we partition the LVLM into several stages and drop part of the image tokens at the end of each stage with a pre-defined ratio, creating pyramid-like visual tokens across model layers. The dropping is based on a lightweight similarity calculation with a negligible time overhead. Extensive experiments demonstrate that PyramidDrop can achieve a 40% training time and 55% inference FLOPs acceleration of LLaVA-NeXT with comparable performance. Besides, the PyramidDrop could also serve as a plug-and-play strategy for inference acceleration without training, with better performance and lower inference cost than counterparts. We hope that the insights and approach introduced by PyramidDrop will inspire future research to further investigate the role of image tokens in LVLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17247v1-abstract-full').style.display = 'none'; document.getElementById('2410.17247v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17243">arXiv:2410.17243</a> <span> [<a href="https://arxiv.org/pdf/2410.17243">pdf</a>, <a href="https://arxiv.org/format/2410.17243">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Breaking the Memory Barrier: Near Infinite Batch Size Scaling for Contrastive Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zesen Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kehan Li</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+S">Sicong Leng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhiqiang Hu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Deli Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Bing%2C+L">Lidong Bing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17243v1-abstract-short" style="display: inline;"> Contrastive loss is a powerful approach for representation learning, where larger batch sizes enhance performance by providing more negative samples to better distinguish between similar and dissimilar data. However, scaling batch sizes is constrained by the quadratic growth in GPU memory consumption, primarily due to the full instantiation of the similarity matrix. To address this, we propose a t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17243v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17243v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17243v1-abstract-full" style="display: none;"> Contrastive loss is a powerful approach for representation learning, where larger batch sizes enhance performance by providing more negative samples to better distinguish between similar and dissimilar data. However, scaling batch sizes is constrained by the quadratic growth in GPU memory consumption, primarily due to the full instantiation of the similarity matrix. To address this, we propose a tile-based computation strategy that partitions the contrastive loss calculation into arbitrary small blocks, avoiding full materialization of the similarity matrix. Furthermore, we introduce a multi-level tiling strategy to leverage the hierarchical structure of distributed systems, employing ring-based communication at the GPU level to optimize synchronization and fused kernels at the CUDA core level to reduce I/O overhead. Experimental results show that the proposed method scales batch sizes to unprecedented levels. For instance, it enables contrastive training of a CLIP-ViT-L/14 model with a batch size of 4M or 12M using 8 or 32 A800 80GB without sacrificing any accuracy. Compared to SOTA memory-efficient solutions, it achieves a two-order-of-magnitude reduction in memory while maintaining comparable speed. The code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17243v1-abstract-full').style.display = 'none'; document.getElementById('2410.17243v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15595">arXiv:2410.15595</a> <span> [<a href="https://arxiv.org/pdf/2410.15595">pdf</a>, <a href="https://arxiv.org/ps/2410.15595">ps</a>, <a href="https://arxiv.org/format/2410.15595">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey of Direct Preference Optimization: Datasets, Theories, Variants, and Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wenyi Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zechuan Wang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+L">Leilei Gan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuai Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanggui He</a>, <a href="/search/cs?searchtype=author&query=Tuan%2C+L+A">Luu Anh Tuan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhou Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15595v2-abstract-short" style="display: inline;"> With the rapid advancement of large language models (LLMs), aligning policy models with human preferences has become increasingly critical. Direct Preference Optimization (DPO) has emerged as a promising approach for alignment, acting as an RL-free alternative to Reinforcement Learning from Human Feedback (RLHF). Despite DPO's various advancements and inherent limitations, an in-depth review of th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15595v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15595v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15595v2-abstract-full" style="display: none;"> With the rapid advancement of large language models (LLMs), aligning policy models with human preferences has become increasingly critical. Direct Preference Optimization (DPO) has emerged as a promising approach for alignment, acting as an RL-free alternative to Reinforcement Learning from Human Feedback (RLHF). Despite DPO's various advancements and inherent limitations, an in-depth review of these aspects is currently lacking in the literature. In this work, we present a comprehensive review of the challenges and opportunities in DPO, covering theoretical analyses, variants, relevant preference datasets, and applications. Specifically, we categorize recent studies on DPO based on key research questions to provide a thorough understanding of DPO's current landscape. Additionally, we propose several future research directions to offer insights on model alignment for the research community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15595v2-abstract-full').style.display = 'none'; document.getElementById('2410.15595v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15319">arXiv:2410.15319</a> <span> [<a href="https://arxiv.org/pdf/2410.15319">pdf</a>, <a href="https://arxiv.org/format/2410.15319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Causality for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+A">Anpeng Wu</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Minqin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingrong Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yujia Zheng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+K">Kairong Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Baohong Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15319v1-abstract-short" style="display: inline;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks. However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15319v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15319v1-abstract-full" style="display: none;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks. However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns and social stereotypes, rather than the true causal relationships between entities and events. This limitation renders LLMs vulnerable to issues such as demographic biases, social stereotypes, and LLM hallucinations. These challenges highlight the urgent need to integrate causality into LLMs, moving beyond correlation-driven paradigms to build more reliable and ethically aligned AI systems. While many existing surveys and studies focus on utilizing prompt engineering to activate LLMs for causal knowledge or developing benchmarks to assess their causal reasoning abilities, most of these efforts rely on human intervention to activate pre-trained models. How to embed causality into the training process of LLMs and build more general and intelligent models remains unexplored. Recent research highlights that LLMs function as causal parrots, capable of reciting causal knowledge without truly understanding or applying it. These prompt-based methods are still limited to human interventional improvements. This survey aims to address this gap by exploring how causality can enhance LLMs at every stage of their lifecycle-from token embedding learning and foundation model training to fine-tuning, alignment, inference, and evaluation-paving the way for more interpretable, reliable, and causally-informed models. Additionally, we further outline six promising future directions to advance LLM development, enhance their causal reasoning capabilities, and address the current limitations these models face. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'none'; document.getElementById('2410.15319v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15116">arXiv:2410.15116</a> <span> [<a href="https://arxiv.org/pdf/2410.15116">pdf</a>, <a href="https://arxiv.org/format/2410.15116">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Coarse-to-Fine Highlighting: Reducing Knowledge Hallucination in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+Q">Qitan Lv</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanzhu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongdong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15116v1-abstract-short" style="display: inline;"> Generation of plausible but incorrect factual information, often termed hallucination, has attracted significant research interest. Retrieval-augmented language model (RALM) -- which enhances models with up-to-date knowledge -- emerges as a promising method to reduce hallucination. However, existing RALMs may instead exacerbate hallucination when retrieving lengthy contexts. To address this challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15116v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15116v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15116v1-abstract-full" style="display: none;"> Generation of plausible but incorrect factual information, often termed hallucination, has attracted significant research interest. Retrieval-augmented language model (RALM) -- which enhances models with up-to-date knowledge -- emerges as a promising method to reduce hallucination. However, existing RALMs may instead exacerbate hallucination when retrieving lengthy contexts. To address this challenge, we propose COFT, a novel \textbf{CO}arse-to-\textbf{F}ine highligh\textbf{T}ing method to focus on different granularity-level key texts, thereby avoiding getting lost in lengthy contexts. Specifically, COFT consists of three components: \textit{recaller}, \textit{scorer}, and \textit{selector}. First, \textit{recaller} applies a knowledge graph to extract potential key entities in a given context. Second, \textit{scorer} measures the importance of each entity by calculating its contextual weight. Finally, \textit{selector} selects high contextual weight entities with a dynamic threshold algorithm and highlights the corresponding paragraphs, sentences, or words in a coarse-to-fine manner. Extensive experiments on the knowledge hallucination benchmark demonstrate the effectiveness of COFT, leading to a superior performance over $30\%$ in the F1 score metric. Moreover, COFT also exhibits remarkable versatility across various long-form tasks, such as reading comprehension and question answering. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15116v1-abstract-full').style.display = 'none'; document.getElementById('2410.15116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14743">arXiv:2410.14743</a> <span> [<a href="https://arxiv.org/pdf/2410.14743">pdf</a>, <a href="https://arxiv.org/format/2410.14743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Deep Learning Board: Training Feedback Is Not All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+L">Lina Gong</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qi Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+M">Mingqiang Wei</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14743v1-abstract-short" style="display: inline;"> Current automatic deep learning (i.e., AutoDL) frameworks rely on training feedback from actual runs, which often hinder their ability to provide quick and clear performance predictions for selecting suitable DL systems. To address this issue, we propose EfficientDL, an innovative deep learning board designed for automatic performance prediction and component recommendation. EfficientDL can quickl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14743v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14743v1-abstract-full" style="display: none;"> Current automatic deep learning (i.e., AutoDL) frameworks rely on training feedback from actual runs, which often hinder their ability to provide quick and clear performance predictions for selecting suitable DL systems. To address this issue, we propose EfficientDL, an innovative deep learning board designed for automatic performance prediction and component recommendation. EfficientDL can quickly and precisely recommend twenty-seven system components and predict the performance of DL models without requiring any training feedback. The magic of no training feedback comes from our proposed comprehensive, multi-dimensional, fine-grained system component dataset, which enables us to develop a static performance prediction model and comprehensive optimized component recommendation algorithm (i.e., 伪\b{eta}-BO search), removing the dependency on actually running parameterized models during the traditional optimization search process. The simplicity and power of EfficientDL stem from its compatibility with most DL models. For example, EfficientDL operates seamlessly with mainstream models such as ResNet50, MobileNetV3, EfficientNet-B0, MaxViT-T, Swin-B, and DaViT-T, bringing competitive performance improvements. Besides, experimental results on the CIFAR-10 dataset reveal that EfficientDL outperforms existing AutoML tools in both accuracy and efficiency (approximately 20 times faster along with 1.31% Top-1 accuracy improvement than the cutting-edge methods). Source code, pretrained models, and datasets are available at https://github.com/OpenSELab/EfficientDL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14743v1-abstract-full').style.display = 'none'; document.getElementById('2410.14743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13978">arXiv:2410.13978</a> <span> [<a href="https://arxiv.org/pdf/2410.13978">pdf</a>, <a href="https://arxiv.org/ps/2410.13978">ps</a>, <a href="https://arxiv.org/format/2410.13978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Theoretical Economics">econ.TH</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Incentivizing Information Acquisition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13978v1-abstract-short" style="display: inline;"> I study a principal-agent model in which a principal hires an agent to collect information about an unknown continuous state. The agent acquires a signal whose distribution is centered around the state, controlling the signal's precision at a cost. The principal observes neither the precision nor the signal, but rather, using transfers that can depend on the state, incentivizes the agent to choose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13978v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13978v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13978v1-abstract-full" style="display: none;"> I study a principal-agent model in which a principal hires an agent to collect information about an unknown continuous state. The agent acquires a signal whose distribution is centered around the state, controlling the signal's precision at a cost. The principal observes neither the precision nor the signal, but rather, using transfers that can depend on the state, incentivizes the agent to choose high precision and report the signal truthfully. I identify a sufficient and necessary condition on the agent's information structure which ensures that there exists an optimal transfer with a simple cutoff structure: the agent receives a fixed prize when his prediction is close enough to the state and receives nothing otherwise. This condition is mild and applies to all signal distributions commonly used in the literature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13978v1-abstract-full').style.display = 'none'; document.getElementById('2410.13978v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13910">arXiv:2410.13910</a> <span> [<a href="https://arxiv.org/pdf/2410.13910">pdf</a>, <a href="https://arxiv.org/format/2410.13910">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mitigating the Backdoor Effect for Multi-Task Model Merging via Safety-Aware Subspace </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jinluan Yang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+A">Anke Tang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Didi Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengyu Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13910v1-abstract-short" style="display: inline;"> Model merging has gained significant attention as a cost-effective approach to integrate multiple single-task fine-tuned models into a unified one that can perform well on multiple tasks. However, existing model merging techniques primarily focus on resolving conflicts between task-specific models, they often overlook potential security threats, particularly the risk of backdoor attacks in the ope… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13910v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13910v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13910v1-abstract-full" style="display: none;"> Model merging has gained significant attention as a cost-effective approach to integrate multiple single-task fine-tuned models into a unified one that can perform well on multiple tasks. However, existing model merging techniques primarily focus on resolving conflicts between task-specific models, they often overlook potential security threats, particularly the risk of backdoor attacks in the open-source model ecosystem. In this paper, we first investigate the vulnerabilities of existing model merging methods to backdoor attacks, identifying two critical challenges: backdoor succession and backdoor transfer. To address these issues, we propose a novel Defense-Aware Merging (DAM) approach that simultaneously mitigates task interference and backdoor vulnerabilities. Specifically, DAM employs a meta-learning-based optimization method with dual masks to identify a shared and safety-aware subspace for model merging. These masks are alternately optimized: the Task-Shared mask identifies common beneficial parameters across tasks, aiming to preserve task-specific knowledge while reducing interference, while the Backdoor-Detection mask isolates potentially harmful parameters to neutralize security threats. This dual-mask design allows us to carefully balance the preservation of useful knowledge and the removal of potential vulnerabilities. Compared to existing merging methods, DAM achieves a more favorable balance between performance and security, reducing the attack success rate by 2-10 percentage points while sacrificing only about 1% in accuracy. Furthermore, DAM exhibits robust performance and broad applicability across various types of backdoor attacks and the number of compromised models involved in the merging process. We will release the codes and models soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13910v1-abstract-full').style.display = 'none'; document.getElementById('2410.13910v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages,8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13842">arXiv:2410.13842</a> <span> [<a href="https://arxiv.org/pdf/2410.13842">pdf</a>, <a href="https://arxiv.org/format/2410.13842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yansong Peng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hebei Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Peixi Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yueyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xiaoyan Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13842v1-abstract-short" style="display: inline;"> We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). FDR transforms the regression process from predicting fixed coordinates to iteratively ref… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13842v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13842v1-abstract-full" style="display: none;"> We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: https://github.com/Peterande/D-FINE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13842v1-abstract-full').style.display = 'none'; document.getElementById('2410.13842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13699">arXiv:2410.13699</a> <span> [<a href="https://arxiv.org/pdf/2410.13699">pdf</a>, <a href="https://arxiv.org/format/2410.13699">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unconstrained Model Merging for Enhanced LLM Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiming Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Baoyi He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yuhao Fu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qi Zhou</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+Z">Zhijie Sang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Z">Zijin Hong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kejing Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjun Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jianbo Yuan</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+G">Guanghan Ning</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linyi Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+C">Chunlin Ji</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongxia Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13699v2-abstract-short" style="display: inline;"> Recent advancements in building domain-specific large language models (LLMs) have shown remarkable success, especially in tasks requiring reasoning abilities like logical inference over complex relationships and multi-step problem solving. However, creating a powerful all-in-one LLM remains challenging due to the need for proprietary data and vast computational resources. As a resource-friendly al… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13699v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13699v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13699v2-abstract-full" style="display: none;"> Recent advancements in building domain-specific large language models (LLMs) have shown remarkable success, especially in tasks requiring reasoning abilities like logical inference over complex relationships and multi-step problem solving. However, creating a powerful all-in-one LLM remains challenging due to the need for proprietary data and vast computational resources. As a resource-friendly alternative, we explore the potential of merging multiple expert models into a single LLM. Existing studies on model merging mainly focus on generalist LLMs instead of domain experts, or the LLMs under the same architecture and size. In this work, we propose an unconstrained model merging framework that accommodates both homogeneous and heterogeneous model architectures with a focus on reasoning tasks. A fine-grained layer-wise weight merging strategy is designed for homogeneous models merging, while heterogeneous model merging is built upon the probabilistic distribution knowledge derived from instruction-response fine-tuning data. Across 7 benchmarks and 9 reasoning-optimized LLMs, we reveal key findings that combinatorial reasoning emerges from merging which surpasses simple additive effects. We propose that unconstrained model merging could serve as a foundation for decentralized LLMs, marking a notable progression from the existing centralized LLM framework. This evolution could enhance wider participation and stimulate additional advancement in the field of artificial intelligence, effectively addressing the constraints posed by centralized models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13699v2-abstract-full').style.display = 'none'; document.getElementById('2410.13699v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review, correct typos</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12476">arXiv:2410.12476</a> <span> [<a href="https://arxiv.org/pdf/2410.12476">pdf</a>, <a href="https://arxiv.org/format/2410.12476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Reasoning Large Language Model-based Synthetic Clinical Trial Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zerui Xu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fang Wu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+T">Tianfan Fu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yue Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12476v1-abstract-short" style="display: inline;"> Machine learning (ML) exhibits promise in the clinical domain. However, it is constrained by data scarcity and ethical considerations, as the generation of clinical trials presents significant challenges due to stringent privacy regulations, high costs, and the extended duration required for conducting studies with human participants. Despite the advancements of large language models (LLMs) in gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12476v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12476v1-abstract-full" style="display: none;"> Machine learning (ML) exhibits promise in the clinical domain. However, it is constrained by data scarcity and ethical considerations, as the generation of clinical trials presents significant challenges due to stringent privacy regulations, high costs, and the extended duration required for conducting studies with human participants. Despite the advancements of large language models (LLMs) in general generation tasks, their potential in facilitating the generation of synthetic clinical trials is under-explored. To address this gap, we introduce a novel Retrieval-Reasoning few-shot framework that leverages LLMs to generate artificial yet realistic and diverse clinical trials with binary success/failure labels. Experiments conducted on real clinical trials from the \url{ClinicalTrials.gov} database demonstrate that our synthetic data can effectively augment real datasets. Furthermore, by fine-tuning a pre-trained model as a binary classifier on synthetic clinical trial datasets, we demonstrate that this augmentation enhances model training for downstream tasks such as trial outcome prediction. Our findings suggest that LLMs for synthetic clinical trial generation hold promise for accelerating clinical research and upholding ethical standards for patient privacy. The code is publicly available at https://anonymous.4open.science/r/Retrieval_Reasoning_Clinical_Trial_Generation-3EC4. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12476v1-abstract-full').style.display = 'none'; document.getElementById('2410.12476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11239">arXiv:2410.11239</a> <span> [<a href="https://arxiv.org/pdf/2410.11239">pdf</a>, <a href="https://arxiv.org/format/2410.11239">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HR-Agent: A Task-Oriented Dialogue (TOD) LLM Agent Tailored for HR Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weijie Xu</a>, <a href="/search/cs?searchtype=author&query=Desai%2C+J">Jay Desai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fanyou Wu</a>, <a href="/search/cs?searchtype=author&query=Valvoda%2C+J">Josef Valvoda</a>, <a href="/search/cs?searchtype=author&query=Sengamedu%2C+S+H">Srinivasan H. Sengamedu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11239v1-abstract-short" style="display: inline;"> Recent LLM (Large Language Models) advancements benefit many fields such as education and finance, but HR has hundreds of repetitive processes, such as access requests, medical claim filing and time-off submissions, which are unaddressed. We relate these tasks to the LLM agent, which has addressed tasks such as writing assisting and customer support. We present HR-Agent, an efficient, confidential… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11239v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11239v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11239v1-abstract-full" style="display: none;"> Recent LLM (Large Language Models) advancements benefit many fields such as education and finance, but HR has hundreds of repetitive processes, such as access requests, medical claim filing and time-off submissions, which are unaddressed. We relate these tasks to the LLM agent, which has addressed tasks such as writing assisting and customer support. We present HR-Agent, an efficient, confidential, and HR-specific LLM-based task-oriented dialogue system tailored for automating repetitive HR processes such as medical claims and access requests. Since conversation data is not sent to an LLM during inference, it preserves confidentiality required in HR-related tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11239v1-abstract-full').style.display = 'none'; document.getElementById('2410.11239v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07278">arXiv:2410.07278</a> <span> [<a href="https://arxiv.org/pdf/2410.07278">pdf</a>, <a href="https://arxiv.org/format/2410.07278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Retrieval Replace Reduction: An effective visual token reduction method via semantic match </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yingen Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruihui Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhuo Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kenli Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07278v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have demonstrated strong performance across various tasks without requiring training from scratch. However, they face significant computational and memory constraints, particularly when processing multimodal inputs that exceed context length, limiting their scalability. In this paper, we introduce a new approach, \textbf{TRSM} (\textbf{T}oken \textbf{R}educ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07278v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07278v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have demonstrated strong performance across various tasks without requiring training from scratch. However, they face significant computational and memory constraints, particularly when processing multimodal inputs that exceed context length, limiting their scalability. In this paper, we introduce a new approach, \textbf{TRSM} (\textbf{T}oken \textbf{R}eduction via \textbf{S}emantic \textbf{M}atch), which effectively reduces the number of visual tokens without compromising MLLM performance. Inspired by how humans process multimodal tasks, TRSM leverages semantic information from one modality to match relevant semantics in another, reducing the number of visual tokens.Specifically, to retain task relevant visual tokens, we use the text prompt as a query vector to retrieve the most similar vectors from the visual prompt and merge them with the text tokens. Based on experimental results, when applied to LLaVA-1.5\cite{liu2023}, our approach compresses the visual tokens by 20\%, achieving comparable performance across diverse visual question-answering and reasoning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07278v1-abstract-full').style.display = 'none'; document.getElementById('2410.07278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures,3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06934">arXiv:2410.06934</a> <span> [<a href="https://arxiv.org/pdf/2410.06934">pdf</a>, <a href="https://arxiv.org/format/2410.06934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> VEC-Sim: A Simulation Platform for Evaluating Service Caching and Computation Offloading Policies in Vehicular Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolong Xu</a>, <a href="/search/cs?searchtype=author&query=Bilal%2C+M">Muhammad Bilal</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangwei Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Siyu Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06934v1-abstract-short" style="display: inline;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06934v1-abstract-full" style="display: none;"> Computer simulation platforms offer an alternative solution by emulating complex systems in a controlled manner. However, existing Edge Computing (EC) simulators, as well as general-purpose vehicular network simulators, are not tailored for VEC and lack dedicated support for modeling the distinct access pattern, entity mobility trajectory and other unique characteristics of VEC networks. To fill this gap, this paper proposes VEC-Sim, a versatile simulation platform for in-depth evaluation and analysis of various service caching and computation offloading policies in VEC networks. VEC-Sim incorporates realistic mechanisms to replicate real-world access patterns, including service feature vector, vehicle mobility modeling, evolving service popularity, new service upload and user preference shifts, etc. Moreover, its modular architecture and extensive Application Programming Interfaces (APIs) allow seamless integration of customized scheduling policies and user-defined metrics. A comprehensive evaluation of VEC-Sim's capabilities is undertaken in comparison to real-world ground truths. Results prove it to be accurate in reproducing classical scheduling algorithms and extremely effective in conducting case studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06934v1-abstract-full').style.display = 'none'; document.getElementById('2410.06934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06072">arXiv:2410.06072</a> <span> [<a href="https://arxiv.org/pdf/2410.06072">pdf</a>, <a href="https://arxiv.org/format/2410.06072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Training-free LLM-generated Text Detection by Mining Token Probability Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yihuai Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Y">Yifei Bi</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+H">Huangsen Cao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhouhan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06072v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable capabilities in generating high-quality texts across diverse domains. However, the potential misuse of LLMs has raised significant concerns, underscoring the urgent need for reliable detection of LLM-generated texts. Conventional training-based detectors often struggle with generalization, particularly in cross-domain and cross-model scenar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06072v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06072v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable capabilities in generating high-quality texts across diverse domains. However, the potential misuse of LLMs has raised significant concerns, underscoring the urgent need for reliable detection of LLM-generated texts. Conventional training-based detectors often struggle with generalization, particularly in cross-domain and cross-model scenarios. In contrast, training-free methods, which focus on inherent discrepancies through carefully designed statistical features, offer improved generalization and interpretability. Despite this, existing training-free detection methods typically rely on global text sequence statistics, neglecting the modeling of local discriminative features, thereby limiting their detection efficacy. In this work, we introduce a novel training-free detector, termed \textbf{Lastde} that synergizes local and global statistics for enhanced detection. For the first time, we introduce time series analysis to LLM-generated text detection, capturing the temporal dynamics of token probability sequences. By integrating these local statistics with global ones, our detector reveals significant disparities between human and LLM-generated texts. We also propose an efficient alternative, \textbf{Lastde++} to enable real-time detection. Extensive experiments on six datasets involving cross-domain, cross-model, and cross-lingual detection scenarios, under both white-box and black-box settings, demonstrated that our method consistently achieves state-of-the-art performance. Furthermore, our approach exhibits greater robustness against paraphrasing attacks compared to existing baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06072v1-abstract-full').style.display = 'none'; document.getElementById('2410.06072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06044">arXiv:2410.06044</a> <span> [<a href="https://arxiv.org/pdf/2410.06044">pdf</a>, <a href="https://arxiv.org/format/2410.06044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HyperDet: Generalizable Detection of Synthesized Images by Generating and Merging A Mixture of Hyper LoRAs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+H">Huangsen Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yinfeng Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Sixian Zheng</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+K">Kangtao Lv</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhimeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xin Ding</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06044v1-abstract-short" style="display: inline;"> The emergence of diverse generative vision models has recently enabled the synthesis of visually realistic images, underscoring the critical need for effectively detecting these generated images from real photos. Despite advances in this field, existing detection approaches often struggle to accurately identify synthesized images generated by different generative models. In this work, we introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06044v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06044v1-abstract-full" style="display: none;"> The emergence of diverse generative vision models has recently enabled the synthesis of visually realistic images, underscoring the critical need for effectively detecting these generated images from real photos. Despite advances in this field, existing detection approaches often struggle to accurately identify synthesized images generated by different generative models. In this work, we introduce a novel and generalizable detection framework termed HyperDet, which innovatively captures and integrates shared knowledge from a collection of functionally distinct and lightweight expert detectors. HyperDet leverages a large pretrained vision model to extract general detection features while simultaneously capturing and enhancing task-specific features. To achieve this, HyperDet first groups SRM filters into five distinct groups to efficiently capture varying levels of pixel artifacts based on their different functionality and complexity. Then, HyperDet utilizes a hypernetwork to generate LoRA model weights with distinct embedding parameters. Finally, we merge the LoRA networks to form an efficient model ensemble. Also, we propose a novel objective function that balances the pixel and semantic artifacts effectively. Extensive experiments on the UnivFD and Fake2M datasets demonstrate the effectiveness of our approach, achieving state-of-the-art performance. Moreover, our work paves a new way to establish generalizable domain-specific fake image detectors based on pretrained large vision models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06044v1-abstract-full').style.display = 'none'; document.getElementById('2410.06044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03107">arXiv:2410.03107</a> <span> [<a href="https://arxiv.org/pdf/2410.03107">pdf</a>, <a href="https://arxiv.org/format/2410.03107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MBDS: A Multi-Body Dynamics Simulation Dataset for Graph Networks Simulators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Sheng Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fengge Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junsuo Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03107v1-abstract-short" style="display: inline;"> Modeling the structure and events of the physical world constitutes a fundamental objective of neural networks. Among the diverse approaches, Graph Network Simulators (GNS) have emerged as the leading method for modeling physical phenomena, owing to their low computational cost and high accuracy. The datasets employed for training and evaluating physical simulation techniques are typically generat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03107v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03107v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03107v1-abstract-full" style="display: none;"> Modeling the structure and events of the physical world constitutes a fundamental objective of neural networks. Among the diverse approaches, Graph Network Simulators (GNS) have emerged as the leading method for modeling physical phenomena, owing to their low computational cost and high accuracy. The datasets employed for training and evaluating physical simulation techniques are typically generated by researchers themselves, often resulting in limited data volume and quality. Consequently, this poses challenges in accurately assessing the performance of these methods. In response to this, we have constructed a high-quality physical simulation dataset encompassing 1D, 2D, and 3D scenes, along with more trajectories and time-steps compared to existing datasets. Furthermore, our work distinguishes itself by developing eight complete scenes, significantly enhancing the dataset's comprehensiveness. A key feature of our dataset is the inclusion of precise multi-body dynamics, facilitating a more realistic simulation of the physical world. Utilizing our high-quality dataset, we conducted a systematic evaluation of various existing GNS methods. Our dataset is accessible for download at https://github.com/Sherlocktein/MBDS, offering a valuable resource for researchers to enhance the training and evaluation of their methodologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03107v1-abstract-full').style.display = 'none'; document.getElementById('2410.03107v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01188">arXiv:2410.01188</a> <span> [<a href="https://arxiv.org/pdf/2410.01188">pdf</a>, <a href="https://arxiv.org/format/2410.01188">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Gold Panning in Vocabulary: An Adaptive Method for Vocabulary Expansion of Domain-Specific LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chengyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shihang Wang</a>, <a href="/search/cs?searchtype=author&query=Qing%2C+L">Lizhi Qing</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Y">Yangyang Kang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Changlong Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01188v1-abstract-short" style="display: inline;"> While Large Language Models (LLMs) demonstrate impressive generation abilities, they frequently struggle when it comes to specialized domains due to their limited domain-specific knowledge. Studies on domain-specific LLMs resort to expanding the vocabulary before fine-tuning on domain-specific corpus, aiming to decrease the sequence length and enhance efficiency during decoding, without thoroughly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01188v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01188v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01188v1-abstract-full" style="display: none;"> While Large Language Models (LLMs) demonstrate impressive generation abilities, they frequently struggle when it comes to specialized domains due to their limited domain-specific knowledge. Studies on domain-specific LLMs resort to expanding the vocabulary before fine-tuning on domain-specific corpus, aiming to decrease the sequence length and enhance efficiency during decoding, without thoroughly investigating the results of vocabulary expansion to LLMs over different domains. Our pilot study reveals that expansion with only a subset of the entire vocabulary may lead to superior performance. Guided by the discovery, this paper explores how to identify a vocabulary subset to achieve the optimal results. We introduce VEGAD, an adaptive method that automatically identifies valuable words from a given domain vocabulary. Our method has been validated through experiments on three Chinese datasets, demonstrating its effectiveness. Additionally, we have undertaken comprehensive analyses of the method. The selection of a optimal subset for expansion has shown to enhance performance on both domain-specific tasks and general tasks, showcasing the potential of VEGAD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01188v1-abstract-full').style.display = 'none'; document.getElementById('2410.01188v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19091">arXiv:2409.19091</a> <span> [<a href="https://arxiv.org/pdf/2409.19091">pdf</a>, <a href="https://arxiv.org/format/2409.19091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> System-Level Defense against Indirect Prompt Injection Attacks: An Information Flow Control Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fangzhou Wu</a>, <a href="/search/cs?searchtype=author&query=Cecchetti%2C+E">Ethan Cecchetti</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chaowei Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19091v2-abstract-short" style="display: inline;"> Large Language Model-based systems (LLM systems) are information and query processing systems that use LLMs to plan operations from natural-language prompts and feed the output of each successive step into the LLM to plan the next. This structure results in powerful tools that can process complex information from diverse sources but raises critical security concerns. Malicious information from any… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19091v2-abstract-full').style.display = 'inline'; document.getElementById('2409.19091v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19091v2-abstract-full" style="display: none;"> Large Language Model-based systems (LLM systems) are information and query processing systems that use LLMs to plan operations from natural-language prompts and feed the output of each successive step into the LLM to plan the next. This structure results in powerful tools that can process complex information from diverse sources but raises critical security concerns. Malicious information from any source may be processed by the LLM and can compromise the query processing, resulting in nearly arbitrary misbehavior. To tackle this problem, we present a system-level defense based on the principles of information flow control that we call an f-secure LLM system. An f-secure LLM system disaggregates the components of an LLM system into a context-aware pipeline with dynamically generated structured executable plans, and a security monitor filters out untrusted input into the planning process. This structure prevents compromise while maximizing flexibility. We provide formal models for both existing LLM systems and our f-secure LLM system, allowing analysis of critical security guarantees. We further evaluate case studies and benchmarks showing that f-secure LLM systems provide robust security while preserving functionality and efficiency. Our code is released at https://github.com/fzwark/Secure_LLM_System. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19091v2-abstract-full').style.display = 'none'; document.getElementById('2409.19091v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18839">arXiv:2409.18839</a> <span> [<a href="https://arxiv.org/pdf/2409.18839">pdf</a>, <a href="https://arxiv.org/format/2409.18839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MinerU: An Open-Source Solution for Precise Document Content Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiaomeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+L">Linke Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhiyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rui Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kaiwen Liu</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Y">Yuan Qu</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+F">Fukai Shang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+L">Liqun Wei</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhihao Sui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18839v1-abstract-short" style="display: inline;"> Document content analysis has been a crucial research area in computer vision. Despite significant advancements in methods such as OCR, layout detection, and formula recognition, existing open-source solutions struggle to consistently deliver high-quality content extraction due to the diversity in document types and content. To address these challenges, we present MinerU, an open-source solution f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18839v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18839v1-abstract-full" style="display: none;"> Document content analysis has been a crucial research area in computer vision. Despite significant advancements in methods such as OCR, layout detection, and formula recognition, existing open-source solutions struggle to consistently deliver high-quality content extraction due to the diversity in document types and content. To address these challenges, we present MinerU, an open-source solution for high-precision document content extraction. MinerU leverages the sophisticated PDF-Extract-Kit models to extract content from diverse documents effectively and employs finely-tuned preprocessing and postprocessing rules to ensure the accuracy of the final results. Experimental results demonstrate that MinerU consistently achieves high performance across various document types, significantly enhancing the quality and consistency of content extraction. The MinerU open-source project is available at https://github.com/opendatalab/MinerU. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18839v1-abstract-full').style.display = 'none'; document.getElementById('2409.18839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MinerU Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16167">arXiv:2409.16167</a> <span> [<a href="https://arxiv.org/pdf/2409.16167">pdf</a>, <a href="https://arxiv.org/format/2409.16167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Merging LoRAs like Playing LEGO: Pushing the Modularity of LoRA to Extremes Through Rank-Wise Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ziyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+T">Tao Shen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Didi Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zexi Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+J">Jing Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xuwu Wang</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16167v3-abstract-short" style="display: inline;"> Low-Rank Adaptation (LoRA) has emerged as a popular technique for fine-tuning large language models (LLMs) to various domains due to its modular design and widespread availability on platforms like Huggingface. This modularity has sparked interest in combining multiple LoRAs to enhance LLM capabilities. However, existing methods for LoRA composition primarily focus on task-specific adaptations tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16167v3-abstract-full').style.display = 'inline'; document.getElementById('2409.16167v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16167v3-abstract-full" style="display: none;"> Low-Rank Adaptation (LoRA) has emerged as a popular technique for fine-tuning large language models (LLMs) to various domains due to its modular design and widespread availability on platforms like Huggingface. This modularity has sparked interest in combining multiple LoRAs to enhance LLM capabilities. However, existing methods for LoRA composition primarily focus on task-specific adaptations that require additional training, and current model merging techniques often fail to fully leverage LoRA's modular nature, leading to parameter interference and performance degradation. In this paper, we investigate the feasibility of disassembling and reassembling multiple LoRAs at a finer granularity, analogous to assembling LEGO blocks. We introduce the concept of Minimal Semantic Units (MSUs), where the parameters corresponding to each rank in LoRA function as independent units. These MSUs demonstrate permutation invariance and concatenation-summation equivalence properties, enabling flexible combinations to create new LoRAs. Building on these insights, we propose the LoRA-LEGO framework. This framework conducts rank-wise parameter clustering by grouping MSUs from different LoRAs into $k$ clusters. The centroid of each cluster serves as a representative MSU, enabling the assembly of a merged LoRA with an adjusted rank of $k$. Additionally, we apply a dual reweighting strategy to optimize the scale of the merged LoRA. Experiments across various benchmarks demonstrate that our method outperforms existing approaches in LoRA merging. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16167v3-abstract-full').style.display = 'none'; document.getElementById('2409.16167v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13331">arXiv:2409.13331</a> <span> [<a href="https://arxiv.org/pdf/2409.13331">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Applying Pre-trained Multilingual BERT in Embeddings for Improved Malicious Prompt Injection Attacks Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rahman%2C+M+A">Md Abdur Rahman</a>, <a href="/search/cs?searchtype=author&query=Shahriar%2C+H">Hossain Shahriar</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Cuzzocrea%2C+A">Alfredo Cuzzocrea</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13331v1-abstract-short" style="display: inline;"> Large language models (LLMs) are renowned for their exceptional capabilities, and applying to a wide range of applications. However, this widespread use brings significant vulnerabilities. Also, it is well observed that there are huge gap which lies in the need for effective detection and mitigation strategies against malicious prompt injection attacks in large language models, as current approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13331v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13331v1-abstract-full" style="display: none;"> Large language models (LLMs) are renowned for their exceptional capabilities, and applying to a wide range of applications. However, this widespread use brings significant vulnerabilities. Also, it is well observed that there are huge gap which lies in the need for effective detection and mitigation strategies against malicious prompt injection attacks in large language models, as current approaches may not adequately address the complexity and evolving nature of these vulnerabilities in real-world applications. Therefore, this work focuses the impact of malicious prompt injection attacks which is one of most dangerous vulnerability on real LLMs applications. It examines to apply various BERT (Bidirectional Encoder Representations from Transformers) like multilingual BERT, DistilBert for classifying malicious prompts from legitimate prompts. Also, we observed how tokenizing the prompt texts and generating embeddings using multilingual BERT contributes to improve the performance of various machine learning methods: Gaussian Naive Bayes, Random Forest, Support Vector Machine, and Logistic Regression. The performance of each model is rigorously analyzed with various parameters to improve the binary classification to discover malicious prompts. Multilingual BERT approach to embed the prompts significantly improved and outperformed the existing works and achieves an outstanding accuracy of 96.55% by Logistic regression. Additionally, we investigated the incorrect predictions of the model to gain insights into its limitations. The findings can guide researchers in tuning various BERT for finding the most suitable model for diverse LLMs vulnerabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13331v1-abstract-full').style.display = 'none'; document.getElementById('2409.13331v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11863">arXiv:2409.11863</a> <span> [<a href="https://arxiv.org/pdf/2409.11863">pdf</a>, <a href="https://arxiv.org/format/2409.11863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning Task Planning from Multi-Modal Demonstration for Multi-Stage Contact-Rich Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejia Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zheng Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lingyun Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Bing%2C+Z">Zhenshan Bing</a>, <a href="/search/cs?searchtype=author&query=Haddadin%2C+S">Sami Haddadin</a>, <a href="/search/cs?searchtype=author&query=Knoll%2C+A">Alois Knoll</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11863v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have gained popularity in task planning for long-horizon manipulation tasks. To enhance the validity of LLM-generated plans, visual demonstrations and online videos have been widely employed to guide the planning process. However, for manipulation tasks involving subtle movements but rich contact interactions, visual perception alone may be insufficient for the LLM to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11863v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11863v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have gained popularity in task planning for long-horizon manipulation tasks. To enhance the validity of LLM-generated plans, visual demonstrations and online videos have been widely employed to guide the planning process. However, for manipulation tasks involving subtle movements but rich contact interactions, visual perception alone may be insufficient for the LLM to fully interpret the demonstration. Additionally, visual data provides limited information on force-related parameters and conditions, which are crucial for effective execution on real robots. In this paper, we introduce an in-context learning framework that incorporates tactile and force-torque information from human demonstrations to enhance LLMs' ability to generate plans for new task scenarios. We propose a bootstrapped reasoning pipeline that sequentially integrates each modality into a comprehensive task plan. This task plan is then used as a reference for planning in new task configurations. Real-world experiments on two different sequential manipulation tasks demonstrate the effectiveness of our framework in improving LLMs' understanding of multi-modal demonstrations and enhancing the overall planning performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11863v1-abstract-full').style.display = 'none'; document.getElementById('2409.11863v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11234">arXiv:2409.11234</a> <span> [<a href="https://arxiv.org/pdf/2409.11234">pdf</a>, <a href="https://arxiv.org/format/2409.11234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> STCMOT: Spatio-Temporal Cohesion Learning for UAV-Based Multiple Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianbo Ma</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chuanming Tang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Can Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianlin Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiyong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11234v1-abstract-short" style="display: inline;"> Multiple object tracking (MOT) in Unmanned Aerial Vehicle (UAV) videos is important for diverse applications in computer vision. Current MOT trackers rely on accurate object detection results and precise matching of target reidentification (ReID). These methods focus on optimizing target spatial attributes while overlooking temporal cues in modelling object relationships, especially for challengin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11234v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11234v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11234v1-abstract-full" style="display: none;"> Multiple object tracking (MOT) in Unmanned Aerial Vehicle (UAV) videos is important for diverse applications in computer vision. Current MOT trackers rely on accurate object detection results and precise matching of target reidentification (ReID). These methods focus on optimizing target spatial attributes while overlooking temporal cues in modelling object relationships, especially for challenging tracking conditions such as object deformation and blurring, etc. To address the above-mentioned issues, we propose a novel Spatio-Temporal Cohesion Multiple Object Tracking framework (STCMOT), which utilizes historical embedding features to model the representation of ReID and detection features in a sequential order. Concretely, a temporal embedding boosting module is introduced to enhance the discriminability of individual embedding based on adjacent frame cooperation. While the trajectory embedding is then propagated by a temporal detection refinement module to mine salient target locations in the temporal field. Extensive experiments on the VisDrone2019 and UAVDT datasets demonstrate our STCMOT sets a new state-of-the-art performance in MOTA and IDF1 metrics. The source codes are released at https://github.com/ydhcg-BoBo/STCMOT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11234v1-abstract-full').style.display = 'none'; document.getElementById('2409.11234v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11047">arXiv:2409.11047</a> <span> [<a href="https://arxiv.org/pdf/2409.11047">pdf</a>, <a href="https://arxiv.org/format/2409.11047">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TacDiffusion: Force-domain Diffusion Policy for Precise Tactile Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yansong Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zongxie Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lingyun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Liding Zhang</a>, <a href="/search/cs?searchtype=author&query=Bing%2C+Z">Zhenshan Bing</a>, <a href="/search/cs?searchtype=author&query=Swikir%2C+A">Abdalla Swikir</a>, <a href="/search/cs?searchtype=author&query=Knoll%2C+A">Alois Knoll</a>, <a href="/search/cs?searchtype=author&query=Haddadin%2C+S">Sami Haddadin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11047v1-abstract-short" style="display: inline;"> Assembly is a crucial skill for robots in both modern manufacturing and service robotics. However, mastering transferable insertion skills that can handle a variety of high-precision assembly tasks remains a significant challenge. This paper presents a novel framework that utilizes diffusion models to generate 6D wrench for high-precision tactile robotic insertion tasks. It learns from demonstrati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11047v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11047v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11047v1-abstract-full" style="display: none;"> Assembly is a crucial skill for robots in both modern manufacturing and service robotics. However, mastering transferable insertion skills that can handle a variety of high-precision assembly tasks remains a significant challenge. This paper presents a novel framework that utilizes diffusion models to generate 6D wrench for high-precision tactile robotic insertion tasks. It learns from demonstrations performed on a single task and achieves a zero-shot transfer success rate of 95.7% across various novel high-precision tasks. Our method effectively inherits the self-adaptability demonstrated by our previous work. In this framework, we address the frequency misalignment between the diffusion policy and the real-time control loop with a dynamic system-based filter, significantly improving the task success rate by 9.15%. Furthermore, we provide a practical guideline regarding the trade-off between diffusion models' inference ability and speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11047v1-abstract-full').style.display = 'none'; document.getElementById('2409.11047v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10444">arXiv:2409.10444</a> <span> [<a href="https://arxiv.org/pdf/2409.10444">pdf</a>, <a href="https://arxiv.org/format/2409.10444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> LLM as BT-Planner: Leveraging LLMs for Behavior Tree Generation in Robot Task Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ao%2C+J">Jicong Ao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yansong Wu</a>, <a href="/search/cs?searchtype=author&query=Swikir%2C+A">Abdalla Swikir</a>, <a href="/search/cs?searchtype=author&query=Haddadin%2C+S">Sami Haddadin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10444v1-abstract-short" style="display: inline;"> Robotic assembly tasks are open challenges due to the long task horizon and complex part relations. Behavior trees (BTs) are increasingly used in robot task planning for their modularity and flexibility, but manually designing them can be effort-intensive. Large language models (LLMs) have recently been applied in robotic task planning for generating action sequences, but their ability to generate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10444v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10444v1-abstract-full" style="display: none;"> Robotic assembly tasks are open challenges due to the long task horizon and complex part relations. Behavior trees (BTs) are increasingly used in robot task planning for their modularity and flexibility, but manually designing them can be effort-intensive. Large language models (LLMs) have recently been applied in robotic task planning for generating action sequences, but their ability to generate BTs has not been fully investigated. To this end, We propose LLM as BT-planner, a novel framework to leverage LLMs for BT generation in robotic assembly task planning and execution. Four in-context learning methods are introduced to utilize the natural language processing and inference capabilities of LLMs to produce task plans in BT format, reducing manual effort and ensuring robustness and comprehensibility. We also evaluate the performance of fine-tuned, fewer-parameter LLMs on the same tasks. Experiments in simulated and real-world settings show that our framework enhances LLMs' performance in BT generation, improving success rates in BT generation through in-context learning and supervised fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10444v1-abstract-full').style.display = 'none'; document.getElementById('2409.10444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09435">arXiv:2409.09435</a> <span> [<a href="https://arxiv.org/pdf/2409.09435">pdf</a>, <a href="https://arxiv.org/format/2409.09435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Behavior Tree Generation using Large Language Models for Sequential Manipulation Planning with Human Instructions and Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ao%2C+J">Jicong Ao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yansong Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Haddadin%2C+S">Sami Haddadin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09435v1-abstract-short" style="display: inline;"> In this work, we propose an LLM-based BT generation framework to leverage the strengths of both for sequential manipulation planning. To enable human-robot collaborative task planning and enhance intuitive robot programming by nonexperts, the framework takes human instructions to initiate the generation of action sequences and human feedback to refine BT generation in runtime. All presented method… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09435v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09435v1-abstract-full" style="display: none;"> In this work, we propose an LLM-based BT generation framework to leverage the strengths of both for sequential manipulation planning. To enable human-robot collaborative task planning and enhance intuitive robot programming by nonexperts, the framework takes human instructions to initiate the generation of action sequences and human feedback to refine BT generation in runtime. All presented methods within the framework are tested on a real robotic assembly example, which uses a gear set model from the Siemens Robot Assembly Challenge. We use a single manipulator with a tool-changing mechanism, a common practice in flexible manufacturing, to facilitate robust grasping of a large variety of objects. Experimental results are evaluated regarding success rate, logical coherence, executability, time consumption, and token consumption. To our knowledge, this is the first human-guided LLM-based BT generation framework that unifies various plausible ways of using LLMs to fully generate BTs that are executable on the real testbed and take into account granular knowledge of tool use. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09435v1-abstract-full').style.display = 'none'; document.getElementById('2409.09435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICRA 2024 Workshop Exploring Role Allocation in Human-Robot Co-Manipulation </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09039">arXiv:2409.09039</a> <span> [<a href="https://arxiv.org/pdf/2409.09039">pdf</a>, <a href="https://arxiv.org/format/2409.09039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AutoGeo: Automating Geometric Image Dataset Creation for Enhanced Geometry Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zihan Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tao Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Wang Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09039v1-abstract-short" style="display: inline;"> With the rapid advancement of large language models, there has been a growing interest in their capabilities in mathematical reasoning. However, existing research has primarily focused on text-based algebra problems, neglecting the study of geometry due to the lack of high-quality geometric datasets. To address this gap, this paper introduces AutoGeo, a novel approach for automatically generating… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09039v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09039v1-abstract-full" style="display: none;"> With the rapid advancement of large language models, there has been a growing interest in their capabilities in mathematical reasoning. However, existing research has primarily focused on text-based algebra problems, neglecting the study of geometry due to the lack of high-quality geometric datasets. To address this gap, this paper introduces AutoGeo, a novel approach for automatically generating mathematical geometric images to fulfill the demand for large-scale and diverse geometric datasets. AutoGeo facilitates the creation of AutoGeo-100k, an extensive repository comprising 100k high-quality geometry image-text pairs. By leveraging precisely defined geometric clauses, AutoGeo-100k contains a wide variety of geometric shapes, including lines, polygons, circles, and complex spatial relationships, etc. Furthermore, this paper demonstrates the efficacy of AutoGeo-100k in enhancing the performance of multimodal large language models through fine-tuning. Experimental results indicate significant improvements in the model's ability in handling geometric images, as evidenced by enhanced accuracy in tasks such as geometric captioning and mathematical reasoning. This research not only fills a critical gap in the availability of geometric datasets but also paves the way for the advancement of sophisticated AI-driven tools in education and research. Project page: https://autogeo-official.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09039v1-abstract-full').style.display = 'none'; document.getElementById('2409.09039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08580">arXiv:2409.08580</a> <span> [<a href="https://arxiv.org/pdf/2409.08580">pdf</a>, <a href="https://arxiv.org/format/2409.08580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-70352-2_21">10.1007/978-3-031-70352-2_21 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Molecular Graph Representation Learning via Structural Similarity Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+C">Chengyu Yao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hong Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Hang Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fengge Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haiming Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Junsuo Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08580v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have been widely employed for feature representation learning in molecular graphs. Therefore, it is crucial to enhance the expressiveness of feature representation to ensure the effectiveness of GNNs. However, a significant portion of current research primarily focuses on the structural features within individual molecules, often overlooking the structural similarity b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08580v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08580v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have been widely employed for feature representation learning in molecular graphs. Therefore, it is crucial to enhance the expressiveness of feature representation to ensure the effectiveness of GNNs. However, a significant portion of current research primarily focuses on the structural features within individual molecules, often overlooking the structural similarity between molecules, which is a crucial aspect encapsulating rich information on the relationship between molecular properties and structural characteristics. Thus, these approaches fail to capture the rich semantic information at the molecular structure level. To bridge this gap, we introduce the \textbf{Molecular Structural Similarity Motif GNN (MSSM-GNN)}, a novel molecular graph representation learning method that can capture structural similarity information among molecules from a global perspective. In particular, we propose a specially designed graph that leverages graph kernel algorithms to represent the similarity between molecules quantitatively. Subsequently, we employ GNNs to learn feature representations from molecular graphs, aiming to enhance the accuracy of property prediction by incorporating additional molecular representation information. Finally, through a series of experiments conducted on both small-scale and large-scale molecular datasets, we demonstrate that our model consistently outperforms eleven state-of-the-art baselines. The codes are available at https://github.com/yaoyao-yaoyao-cell/MSSM-GNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08580v1-abstract-full').style.display = 'none'; document.getElementById('2409.08580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Machine Learning and Knowledge Discovery in Databases. Research Track. ECML PKDD 2024. Lecture Notes in Computer Science(), vol 14943. Springer, Cham </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08562">arXiv:2409.08562</a> <span> [<a href="https://arxiv.org/pdf/2409.08562">pdf</a>, <a href="https://arxiv.org/format/2409.08562">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CSS: Overcoming Pose and Scene Challenges in Crowd-Sourced 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Runze Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Mingyu Xiao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haiyong Luo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+F">Fang Zhao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fan Wu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hao Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+M">Meng Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08562v1-abstract-short" style="display: inline;"> We introduce Crowd-Sourced Splatting (CSS), a novel 3D Gaussian Splatting (3DGS) pipeline designed to overcome the challenges of pose-free scene reconstruction using crowd-sourced imagery. The dream of reconstructing historically significant but inaccessible scenes from collections of photographs has long captivated researchers. However, traditional 3D techniques struggle with missing camera poses… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08562v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08562v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08562v1-abstract-full" style="display: none;"> We introduce Crowd-Sourced Splatting (CSS), a novel 3D Gaussian Splatting (3DGS) pipeline designed to overcome the challenges of pose-free scene reconstruction using crowd-sourced imagery. The dream of reconstructing historically significant but inaccessible scenes from collections of photographs has long captivated researchers. However, traditional 3D techniques struggle with missing camera poses, limited viewpoints, and inconsistent lighting. CSS addresses these challenges through robust geometric priors and advanced illumination modeling, enabling high-quality novel view synthesis under complex, real-world conditions. Our method demonstrates clear improvements over existing approaches, paving the way for more accurate and flexible applications in AR, VR, and large-scale 3D reconstruction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08562v1-abstract-full').style.display = 'none'; document.getElementById('2409.08562v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08481">arXiv:2409.08481</a> <span> [<a href="https://arxiv.org/pdf/2409.08481">pdf</a>, <a href="https://arxiv.org/format/2409.08481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in 2020s </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoyuan Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Junqi Liao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chuanbo Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haotian Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuqi Li</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+Y">Yifan Bian</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+X">Xihua Sheng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xinmin Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yao Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Changsheng Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08481v2-abstract-short" style="display: inline;"> Image/video coding has been a remarkable research area for both academia and industry for many years. Testing datasets, especially high-quality image/video datasets are desirable for the justified evaluation of coding-related research, practical applications, and standardization activities. We put forward a test dataset namely USTC-TD, which has been successfully adopted in the practical end-to-en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08481v2-abstract-full').style.display = 'inline'; document.getElementById('2409.08481v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08481v2-abstract-full" style="display: none;"> Image/video coding has been a remarkable research area for both academia and industry for many years. Testing datasets, especially high-quality image/video datasets are desirable for the justified evaluation of coding-related research, practical applications, and standardization activities. We put forward a test dataset namely USTC-TD, which has been successfully adopted in the practical end-to-end image/video coding challenge of the IEEE International Conference on Visual Communications and lmage Processing (VCIP) in 2022 and 2023. USTC-TD contains 40 images at 4K spatial resolution and 10 video sequences at 1080p spatial resolution, featuring various content due to the diverse environmental factors (e.g. scene type, texture, motion, view) and the designed imaging factors (e.g. illumination, lens, shadow). We quantitatively evaluate USTC-TD on different image/video features (spatial, temporal, color, lightness), and compare it with the previous image/video test datasets, which verifies the wider coverage and more diversity of the proposed dataset. We also evaluate both classic standardized and recent learned image/video coding schemes on USTC-TD with PSNR and MS-SSIM, and provide an extensive benchmark for the evaluated schemes. Based on the characteristics and specific design of the proposed test dataset, we analyze the benchmark performance and shed light on the future research and development of image/video coding. All the data are released online: https://esakak.github.io/USTC-TD . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08481v2-abstract-full').style.display = 'none'; document.getElementById('2409.08481v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages. Project Page: https://esakak.github.io/USTC-TD</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wu%2C+F&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wu%2C+F&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>