Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 334 results for author: <span class="mathjax">Jiang, B</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Jiang%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Jiang, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Jiang%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Jiang, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10499">arXiv:2411.10499</a> <span> [<a href="https://arxiv.org/pdf/2411.10499">pdf</a>, <a href="https://arxiv.org/format/2411.10499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FitDiT: Advancing the Authentic Garment Details for High-fidelity Virtual Try-on </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qingdong He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengming Xu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jinlong Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunsheng Wu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10499v1-abstract-short" style="display: inline;"> Although image-based virtual try-on has made considerable progress, emerging approaches still encounter challenges in producing high-fidelity and robust fitting images across diverse scenarios. These methods often struggle with issues such as texture-aware maintenance and size-aware fitting, which hinder their overall effectiveness. To address these limitations, we propose a novel garment percepti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10499v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10499v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10499v1-abstract-full" style="display: none;"> Although image-based virtual try-on has made considerable progress, emerging approaches still encounter challenges in producing high-fidelity and robust fitting images across diverse scenarios. These methods often struggle with issues such as texture-aware maintenance and size-aware fitting, which hinder their overall effectiveness. To address these limitations, we propose a novel garment perception enhancement technique, termed FitDiT, designed for high-fidelity virtual try-on using Diffusion Transformers (DiT) allocating more parameters and attention to high-resolution features. First, to further improve texture-aware maintenance, we introduce a garment texture extractor that incorporates garment priors evolution to fine-tune garment feature, facilitating to better capture rich details such as stripes, patterns, and text. Additionally, we introduce frequency-domain learning by customizing a frequency distance loss to enhance high-frequency garment details. To tackle the size-aware fitting issue, we employ a dilated-relaxed mask strategy that adapts to the correct length of garments, preventing the generation of garments that fill the entire mask area during cross-category try-on. Equipped with the above design, FitDiT surpasses all baselines in both qualitative and quantitative evaluations. It excels in producing well-fitting garments with photorealistic and intricate details, while also achieving competitive inference times of 4.57 seconds for a single 1024x768 image after DiT structure slimming, outperforming existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10499v1-abstract-full').style.display = 'none'; document.getElementById('2411.10499v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project link: https://byjiang.com/FitDiT/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09691">arXiv:2411.09691</a> <span> [<a href="https://arxiv.org/pdf/2411.09691">pdf</a>, <a href="https://arxiv.org/format/2411.09691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing Fine-Grained Visual Understanding with Multi-Scale Alignment in Multi-Modal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaowei Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linfeng Li</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">YiQing Cai</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Botian Jiang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Hang Song</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xingcan Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+L">Li Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09691v1-abstract-short" style="display: inline;"> Multi-modal large language models (MLLMs) have achieved remarkable success in fine-grained visual understanding across a range of tasks. However, they often encounter significant challenges due to inadequate alignment for fine-grained knowledge, which restricts their ability to accurately capture local details and attain a comprehensive global perception. While recent advancements have focused on… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09691v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09691v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09691v1-abstract-full" style="display: none;"> Multi-modal large language models (MLLMs) have achieved remarkable success in fine-grained visual understanding across a range of tasks. However, they often encounter significant challenges due to inadequate alignment for fine-grained knowledge, which restricts their ability to accurately capture local details and attain a comprehensive global perception. While recent advancements have focused on aligning object expressions with grounding information, they typically lack explicit integration of object images, which contain affluent information beyond mere texts or coordinates. To bridge this gap, we introduce a novel fine-grained visual knowledge alignment method that effectively aligns and integrates multi-scale knowledge of objects, including texts, coordinates, and images. This innovative method is underpinned by our multi-scale fine-grained enhancement data synthesis pipeline, which provides over 300K essential training data to enhance alignment and improve overall performance. Furthermore, we present TinyGroundingGPT, a series of compact models optimized for high-level alignments. With a scale of approximately 3B parameters, TinyGroundingGPT achieves outstanding results in grounding tasks while delivering performance comparable to larger MLLMs in complex visual scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09691v1-abstract-full').style.display = 'none'; document.getElementById('2411.09691v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02775">arXiv:2411.02775</a> <span> [<a href="https://arxiv.org/pdf/2411.02775">pdf</a>, <a href="https://arxiv.org/format/2411.02775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Winemaking: Extracting Essential Insights for Efficient Threat Detection in Audit Logs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Weiheng Wu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+W">Wei Qiao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+W">Wenhao Yan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuling Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Baoxu Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhigang Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">JunRong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02775v2-abstract-short" style="display: inline;"> Advanced Persistent Threats (APTs) are continuously evolving, leveraging their stealthiness and persistence to put increasing pressure on current provenance-based Intrusion Detection Systems (IDS). This evolution exposes several critical issues: (1) The dense interaction between malicious and benign nodes within provenance graphs introduces neighbor noise, hindering effective detection; (2) The co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02775v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02775v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02775v2-abstract-full" style="display: none;"> Advanced Persistent Threats (APTs) are continuously evolving, leveraging their stealthiness and persistence to put increasing pressure on current provenance-based Intrusion Detection Systems (IDS). This evolution exposes several critical issues: (1) The dense interaction between malicious and benign nodes within provenance graphs introduces neighbor noise, hindering effective detection; (2) The complex prediction mechanisms of existing APTs detection models lead to the insufficient utilization of prior knowledge embedded in the data; (3) The high computational cost makes detection impractical. To address these challenges, we propose Winemaking, a lightweight threat detection system built on a knowledge distillation framework, capable of node-level detection within audit log provenance graphs. Specifically, Winemaking applies graph Laplacian regularization to reduce neighbor noise, obtaining smoothed and denoised graph signals. Subsequently, Winemaking employs a teacher model based on GNNs to extract knowledge, which is then distilled into a lightweight student model. The student model is designed as a trainable combination of a feature transformation module and a personalized PageRank random walk label propagation module, with the former capturing feature knowledge and the latter learning label and structural knowledge. After distillation, the student model benefits from the knowledge of the teacher model to perform precise threat detection. We evaluate Winemaking through extensive experiments on three public datasets and compare its performance against several state-of-the-art IDS solutions. The results demonstrate that Winemaking achieves outstanding detection accuracy across all scenarios and the detection time is 1.4 to 5.2 times faster than the current state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02775v2-abstract-full').style.display = 'none'; document.getElementById('2411.02775v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages body, 11 pages total(without authors)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02442">arXiv:2411.02442</a> <span> [<a href="https://arxiv.org/pdf/2411.02442">pdf</a>, <a href="https://arxiv.org/format/2411.02442">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> TODO: Enhancing LLM Alignment with Ternary Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuxiang Guo</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+L">Lu Yin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaqi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02442v1-abstract-short" style="display: inline;"> Aligning large language models (LLMs) with human intent is critical for enhancing their performance across a variety of tasks. Standard alignment techniques, such as Direct Preference Optimization (DPO), often rely on the binary Bradley-Terry (BT) model, which can struggle to capture the complexities of human preferences -- particularly in the presence of noisy or inconsistent labels and frequent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02442v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02442v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02442v1-abstract-full" style="display: none;"> Aligning large language models (LLMs) with human intent is critical for enhancing their performance across a variety of tasks. Standard alignment techniques, such as Direct Preference Optimization (DPO), often rely on the binary Bradley-Terry (BT) model, which can struggle to capture the complexities of human preferences -- particularly in the presence of noisy or inconsistent labels and frequent ties. To address these limitations, we introduce the Tie-rank Oriented Bradley-Terry model (TOBT), an extension of the BT model that explicitly incorporates ties, enabling more nuanced preference representation. Building on this, we propose Tie-rank Oriented Direct Preference Optimization (TODO), a novel alignment algorithm that leverages TOBT's ternary ranking system to improve preference alignment. In evaluations on Mistral-7B and Llama 3-8B models, TODO consistently outperforms DPO in modeling preferences across both in-distribution and out-of-distribution datasets. Additional assessments using MT Bench and benchmarks such as Piqa, ARC-c, and MMLU further demonstrate TODO's superior alignment performance. Notably, TODO also shows strong results in binary preference alignment, highlighting its versatility and potential for broader integration into LLM alignment. The implementation details can be found in https://github.com/XXares/TODO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02442v1-abstract-full').style.display = 'none'; document.getElementById('2411.02442v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22313">arXiv:2410.22313</a> <span> [<a href="https://arxiv.org/pdf/2410.22313">pdf</a>, <a href="https://arxiv.org/format/2410.22313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Senna: Bridging Large Vision-Language Models and End-to-End Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+B">Bencheng Liao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chang Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyu Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22313v1-abstract-short" style="display: inline;"> End-to-end autonomous driving demonstrates strong planning capabilities with large-scale data but still struggles in complex, rare scenarios due to limited commonsense. In contrast, Large Vision-Language Models (LVLMs) excel in scene understanding and reasoning. The path forward lies in merging the strengths of both approaches. Previous methods using LVLMs to predict trajectories or control signal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22313v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22313v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22313v1-abstract-full" style="display: none;"> End-to-end autonomous driving demonstrates strong planning capabilities with large-scale data but still struggles in complex, rare scenarios due to limited commonsense. In contrast, Large Vision-Language Models (LVLMs) excel in scene understanding and reasoning. The path forward lies in merging the strengths of both approaches. Previous methods using LVLMs to predict trajectories or control signals yield suboptimal results, as LVLMs are not well-suited for precise numerical predictions. This paper presents Senna, an autonomous driving system combining an LVLM (Senna-VLM) with an end-to-end model (Senna-E2E). Senna decouples high-level planning from low-level trajectory prediction. Senna-VLM generates planning decisions in natural language, while Senna-E2E predicts precise trajectories. Senna-VLM utilizes a multi-image encoding approach and multi-view prompts for efficient scene understanding. Besides, we introduce planning-oriented QAs alongside a three-stage training strategy, which enhances Senna-VLM's planning performance while preserving commonsense. Extensive experiments on two datasets show that Senna achieves state-of-the-art planning performance. Notably, with pre-training on a large-scale dataset DriveX and fine-tuning on nuScenes, Senna significantly reduces average planning error by 27.12% and collision rate by 33.33% over model without pre-training. We believe Senna's cross-scenario generalization and transferability are essential for achieving fully autonomous driving. Code and models will be released at https://github.com/hustvl/Senna. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22313v1-abstract-full').style.display = 'none'; document.getElementById('2410.22313v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://github.com/hustvl/Senna</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21749">arXiv:2410.21749</a> <span> [<a href="https://arxiv.org/pdf/2410.21749">pdf</a>, <a href="https://arxiv.org/format/2410.21749">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Reliable and Compact Graph Fine-tuning via GraphSparse Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hao Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Beibei Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jin Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21749v1-abstract-short" style="display: inline;"> Recently, graph prompt learning has garnered increasing attention in adapting pre-trained GNN models for downstream graph learning tasks. However, existing works generally conduct prompting over all graph elements (e.g., nodes, edges, node attributes, etc.), which is suboptimal and obviously redundant. To address this issue, we propose exploiting sparse representation theory for graph prompting an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21749v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21749v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21749v1-abstract-full" style="display: none;"> Recently, graph prompt learning has garnered increasing attention in adapting pre-trained GNN models for downstream graph learning tasks. However, existing works generally conduct prompting over all graph elements (e.g., nodes, edges, node attributes, etc.), which is suboptimal and obviously redundant. To address this issue, we propose exploiting sparse representation theory for graph prompting and present Graph Sparse Prompting (GSP). GSP aims to adaptively and sparsely select the optimal elements (e.g., certain node attributes) to achieve compact prompting for downstream tasks. Specifically, we propose two kinds of GSP models, termed Graph Sparse Feature Prompting (GSFP) and Graph Sparse multi-Feature Prompting (GSmFP). Both GSFP and GSmFP provide a general scheme for tuning any specific pre-trained GNNs that can achieve attribute selection and compact prompt learning simultaneously. A simple yet effective algorithm has been designed for solving GSFP and GSmFP models. Experiments on 16 widely-used benchmark datasets validate the effectiveness and advantages of the proposed GSFPs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21749v1-abstract-full').style.display = 'none'; document.getElementById('2410.21749v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15358">arXiv:2410.15358</a> <span> [<a href="https://arxiv.org/pdf/2410.15358">pdf</a>, <a href="https://arxiv.org/ps/2410.15358">ps</a>, <a href="https://arxiv.org/format/2410.15358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> A New Adaptive Balanced Augmented Lagrangian Method with Application to ISAC Beamforming Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiageng Wu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinxin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Ya-Feng Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jianhua Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15358v1-abstract-short" style="display: inline;"> In this paper, we consider a class of convex programming problems with linear equality constraints, which finds broad applications in machine learning and signal processing. We propose a new adaptive balanced augmented Lagrangian (ABAL) method for solving these problems. The proposed ABAL method adaptively selects the stepsize parameter and enjoys a low per-iteration complexity, involving only the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15358v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15358v1-abstract-full" style="display: none;"> In this paper, we consider a class of convex programming problems with linear equality constraints, which finds broad applications in machine learning and signal processing. We propose a new adaptive balanced augmented Lagrangian (ABAL) method for solving these problems. The proposed ABAL method adaptively selects the stepsize parameter and enjoys a low per-iteration complexity, involving only the computation of a proximal mapping of the objective function and the solution of a linear equation. These features make the proposed method well-suited to large-scale problems. We then custom-apply the ABAL method to solve the ISAC beamforming design problem, which is formulated as a nonlinear semidefinite program in a previous work. This customized application requires careful exploitation of the problem's special structure such as the property that all of its signal-to-interference-and-noise-ratio (SINR) constraints hold with equality at the solution and an efficient computation of the proximal mapping of the objective function. Simulation results demonstrate the efficiency of the proposed ABAL method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15358v1-abstract-full').style.display = 'none'; document.getElementById('2410.15358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12329">arXiv:2410.12329</a> <span> [<a href="https://arxiv.org/pdf/2410.12329">pdf</a>, <a href="https://arxiv.org/format/2410.12329">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Understanding the Role of LLMs in Multimodal Evaluation Benchmarks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Botian Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaonan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaowei Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xiachong Feng</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingpeng Kong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12329v1-abstract-short" style="display: inline;"> The rapid advancement of Multimodal Large Language Models (MLLMs) has been accompanied by the development of various benchmarks to evaluate their capabilities. However, the true nature of these evaluations and the extent to which they assess multimodal reasoning versus merely leveraging the underlying Large Language Model (LLM) backbone remain unclear. This paper presents a comprehensive investiga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12329v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12329v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12329v1-abstract-full" style="display: none;"> The rapid advancement of Multimodal Large Language Models (MLLMs) has been accompanied by the development of various benchmarks to evaluate their capabilities. However, the true nature of these evaluations and the extent to which they assess multimodal reasoning versus merely leveraging the underlying Large Language Model (LLM) backbone remain unclear. This paper presents a comprehensive investigation into the role of LLM backbones in MLLM evaluation, focusing on two critical aspects: the degree to which current benchmarks truly assess multimodal reasoning and the influence of LLM prior knowledge on performance. Specifically, we introduce a modified evaluation protocol to disentangle the contributions of the LLM backbone from multimodal integration, and an automatic knowledge identification technique for diagnosing whether LLMs equip the necessary knowledge for corresponding multimodal questions. Our study encompasses four diverse MLLM benchmarks and eight state-of-the-art MLLMs. Key findings reveal that some benchmarks allow high performance even without visual inputs and up to 50\% of error rates can be attributed to insufficient world knowledge in the LLM backbone, indicating a heavy reliance on language capabilities. To address knowledge deficiencies, we propose a knowledge augmentation pipeline that achieves significant performance gains, with improvements of up to 60\% on certain datasets, resulting in a approximately 4x increase in performance. Our work provides crucial insights into the role of the LLM backbone in MLLMs, and highlights the need for more nuanced benchmarking approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12329v1-abstract-full').style.display = 'none'; document.getElementById('2410.12329v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12309">arXiv:2410.12309</a> <span> [<a href="https://arxiv.org/pdf/2410.12309">pdf</a>, <a href="https://arxiv.org/format/2410.12309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Correction to Local Information Privacy and Its Applications to Data Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Tandon%2C+R">Ravi Tandon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12309v1-abstract-short" style="display: inline;"> In our previous works, we defined Local Information Privacy (LIP) as a context-aware privacy notion and presented the corresponding privacy-preserving mechanism. Then we claim that the mechanism satisfies epsilon-LIP for any epsilon>0 for arbitrary Px. However, this claim is not completely correct. In this document, we provide a correction to the valid range of privacy parameters of our previously… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12309v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12309v1-abstract-full" style="display: none;"> In our previous works, we defined Local Information Privacy (LIP) as a context-aware privacy notion and presented the corresponding privacy-preserving mechanism. Then we claim that the mechanism satisfies epsilon-LIP for any epsilon>0 for arbitrary Px. However, this claim is not completely correct. In this document, we provide a correction to the valid range of privacy parameters of our previously proposed LIP mechanism. Further, we propose efficient algorithms to expand the range of valid privacy parameters. Finally, we discuss the impact of updated results on our original paper's experiments, the rationale of the proposed correction and corrected results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12309v1-abstract-full').style.display = 'none'; document.getElementById('2410.12309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11182">arXiv:2410.11182</a> <span> [<a href="https://arxiv.org/pdf/2410.11182">pdf</a>, <a href="https://arxiv.org/format/2410.11182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Archilles' Heel in Semi-open LLMs: Hiding Bottom against Recovery Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hanbo Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yihan Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bowen Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuotao Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shiyu Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11182v1-abstract-short" style="display: inline;"> Closed-source large language models deliver strong performance but have limited downstream customizability. Semi-open models, combining both closed-source and public layers, were introduced to improve customizability. However, parameters in the closed-source layers are found vulnerable to recovery attacks. In this paper, we explore the design of semi-open models with fewer closed-source layers, ai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11182v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11182v1-abstract-full" style="display: none;"> Closed-source large language models deliver strong performance but have limited downstream customizability. Semi-open models, combining both closed-source and public layers, were introduced to improve customizability. However, parameters in the closed-source layers are found vulnerable to recovery attacks. In this paper, we explore the design of semi-open models with fewer closed-source layers, aiming to increase customizability while ensuring resilience to recovery attacks. We analyze the contribution of closed-source layer to the overall resilience and theoretically prove that in a deep transformer-based model, there exists a transition layer such that even small recovery errors in layers before this layer can lead to recovery failure. Building on this, we propose \textbf{SCARA}, a novel approach that keeps only a few bottom layers as closed-source. SCARA employs a fine-tuning-free metric to estimate the maximum number of layers that can be publicly accessible for customization. We apply it to five models (1.3B to 70B parameters) to construct semi-open models, validating their customizability on six downstream tasks and assessing their resilience against various recovery attacks on sixteen benchmarks. We compare SCARA to baselines and observe that it generally improves downstream customization performance and offers similar resilience with over \textbf{10} times fewer closed-source parameters. We empirically investigate the existence of transition layers, analyze the effectiveness of our scheme and finally discuss its limitations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11182v1-abstract-full').style.display = 'none'; document.getElementById('2410.11182v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages for main content of the paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08260">arXiv:2410.08260</a> <span> [<a href="https://arxiv.org/pdf/2410.08260">pdf</a>, <a href="https://arxiv.org/format/2410.08260">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Koala-36M: A Large-scale Video Dataset Improving Consistency between Fine-grained Conditions and Video Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuheng Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yukai Shi</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+J">Jiarong Ou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Ke Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haotian Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+M">Mingwu Zheng</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xin Tao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fei Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08260v1-abstract-short" style="display: inline;"> As visual generation technologies continue to advance, the scale of video datasets has expanded rapidly, and the quality of these datasets is critical to the performance of video generation models. We argue that temporal splitting, detailed captions, and video quality filtering are three key factors that determine dataset quality. However, existing datasets exhibit various limitations in these are… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08260v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08260v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08260v1-abstract-full" style="display: none;"> As visual generation technologies continue to advance, the scale of video datasets has expanded rapidly, and the quality of these datasets is critical to the performance of video generation models. We argue that temporal splitting, detailed captions, and video quality filtering are three key factors that determine dataset quality. However, existing datasets exhibit various limitations in these areas. To address these challenges, we introduce Koala-36M, a large-scale, high-quality video dataset featuring accurate temporal splitting, detailed captions, and superior video quality. The core of our approach lies in improving the consistency between fine-grained conditions and video content. Specifically, we employ a linear classifier on probability distributions to enhance the accuracy of transition detection, ensuring better temporal consistency. We then provide structured captions for the splitted videos, with an average length of 200 words, to improve text-video alignment. Additionally, we develop a Video Training Suitability Score (VTSS) that integrates multiple sub-metrics, allowing us to filter high-quality videos from the original corpus. Finally, we incorporate several metrics into the training process of the generation model, further refining the fine-grained conditions. Our experiments demonstrate the effectiveness of our data processing pipeline and the quality of the proposed Koala-36M dataset. Our dataset and code will be released at https://koala36m.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08260v1-abstract-full').style.display = 'none'; document.getElementById('2410.08260v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://koala36m.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07854">arXiv:2410.07854</a> <span> [<a href="https://arxiv.org/pdf/2410.07854">pdf</a>, <a href="https://arxiv.org/format/2410.07854">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> HeGraphAdapter: Tuning Multi-Modal Vision-Language Models with Heterogeneous Graph Adapter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yumiao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qin Xu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07854v1-abstract-short" style="display: inline;"> Adapter-based tuning methods have shown significant potential in transferring knowledge from pre-trained Vision-Language Models to the downstream tasks. However, after reviewing existing adapters, we find they generally fail to fully explore the interactions between different modalities in constructing task-specific knowledge. Also, existing works usually only focus on similarity matching between… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07854v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07854v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07854v1-abstract-full" style="display: none;"> Adapter-based tuning methods have shown significant potential in transferring knowledge from pre-trained Vision-Language Models to the downstream tasks. However, after reviewing existing adapters, we find they generally fail to fully explore the interactions between different modalities in constructing task-specific knowledge. Also, existing works usually only focus on similarity matching between positive text prompts, making it challenging to distinguish the classes with high similar visual contents. To address these issues, in this paper, we propose a novel Heterogeneous Graph Adapter to achieve tuning VLMs for the downstream tasks. To be specific, we first construct a unified heterogeneous graph mode, which contains i) visual nodes, positive text nodes and negative text nodes, and ii) several types of edge connections to comprehensively model the intra-modality, inter-modality and inter-class structure knowledge together. Next, we employ a specific Heterogeneous Graph Neural Network to excavate multi-modality structure knowledge for adapting both visual and textual features for the downstream tasks. Finally, after HeGraphAdapter, we construct both text-based and visual-based classifiers simultaneously to comprehensively enhance the performance of the CLIP model. Experimental results on 11 benchmark datasets demonstrate the effectiveness and benefits of the proposed HeGraphAdapter. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07854v1-abstract-full').style.display = 'none'; document.getElementById('2410.07854v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04616">arXiv:2410.04616</a> <span> [<a href="https://arxiv.org/pdf/2410.04616">pdf</a>, <a href="https://arxiv.org/format/2410.04616">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LRQ-Fact: LLM-Generated Relevant Questions for Multimodal Fact-Checking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Beigi%2C+A">Alimohammad Beigi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bohan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Kumarage%2C+T">Tharindu Kumarage</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Shaeri%2C+P">Pouya Shaeri</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04616v1-abstract-short" style="display: inline;"> Human fact-checkers have specialized domain knowledge that allows them to formulate precise questions to verify information accuracy. However, this expert-driven approach is labor-intensive and is not scalable, especially when dealing with complex multimodal misinformation. In this paper, we propose a fully-automated framework, LRQ-Fact, for multimodal fact-checking. Firstly, the framework leverag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04616v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04616v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04616v1-abstract-full" style="display: none;"> Human fact-checkers have specialized domain knowledge that allows them to formulate precise questions to verify information accuracy. However, this expert-driven approach is labor-intensive and is not scalable, especially when dealing with complex multimodal misinformation. In this paper, we propose a fully-automated framework, LRQ-Fact, for multimodal fact-checking. Firstly, the framework leverages Vision-Language Models (VLMs) and Large Language Models (LLMs) to generate comprehensive questions and answers for probing multimodal content. Next, a rule-based decision-maker module evaluates both the original content and the generated questions and answers to assess the overall veracity. Extensive experiments on two benchmarks show that LRQ-Fact improves detection accuracy for multimodal misinformation. Moreover, we evaluate its generalizability across different model backbones, offering valuable insights for further refinement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04616v1-abstract-full').style.display = 'none'; document.getElementById('2410.04616v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03026">arXiv:2410.03026</a> <span> [<a href="https://arxiv.org/pdf/2410.03026">pdf</a>, <a href="https://arxiv.org/format/2410.03026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Characterizing Context Influence and Hallucination in Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Flemings%2C+J">James Flemings</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wanrong Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Takhirov%2C+Z">Zafar Takhirov</a>, <a href="/search/cs?searchtype=author&query=Annavaram%2C+M">Murali Annavaram</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03026v1-abstract-short" style="display: inline;"> Although Large Language Models (LLMs) have achieved remarkable performance in numerous downstream tasks, their ubiquity has raised two significant concerns. One is that LLMs can hallucinate by generating content that contradicts relevant contextual information; the other is that LLMs can inadvertently leak private information due to input regurgitation. Many prior works have extensively studied ea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03026v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03026v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03026v1-abstract-full" style="display: none;"> Although Large Language Models (LLMs) have achieved remarkable performance in numerous downstream tasks, their ubiquity has raised two significant concerns. One is that LLMs can hallucinate by generating content that contradicts relevant contextual information; the other is that LLMs can inadvertently leak private information due to input regurgitation. Many prior works have extensively studied each concern independently, but none have investigated them simultaneously. Furthermore, auditing the influence of provided context during open-ended generation with a privacy emphasis is understudied. To this end, we comprehensively characterize the influence and hallucination of contextual information during summarization. We introduce a definition for context influence and Context-Influence Decoding (CID), and then we show that amplifying the context (by factoring out prior knowledge) and the context being out of distribution with respect to prior knowledge increases the context's influence on an LLM. Moreover, we show that context influence gives a lower bound of the private information leakage of CID. We corroborate our analytical findings with experimental evaluations that show improving the F1 ROGUE-L score on CNN-DM for LLaMA 3 by $\textbf{10}$% over regular decoding also leads to $\textbf{1.5x}$ more influence by the context. Moreover, we empirically evaluate how context influence and hallucination are affected by (1) model capacity, (2) context size, (3) the length of the current response, and (4) different token $n$-grams of the context. Our code can be accessed here: https://github.com/james-flemings/context_influence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03026v1-abstract-full').style.display = 'none'; document.getElementById('2410.03026v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00982">arXiv:2410.00982</a> <span> [<a href="https://arxiv.org/pdf/2410.00982">pdf</a>, <a href="https://arxiv.org/format/2410.00982">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ScVLM: a Vision-Language Model for Driving Safety Critical Event Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+L">Liang Shi</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+F">Feng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00982v1-abstract-short" style="display: inline;"> Accurately identifying, understanding, and describing driving safety-critical events (SCEs), including crashes and near-crashes, is crucial for traffic safety, automated driving systems, and advanced driver assistance systems research and application. As SCEs are rare events, most general Vision-Language Models (VLMs) have not been trained sufficiently to link SCE videos and narratives, which coul… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00982v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00982v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00982v1-abstract-full" style="display: none;"> Accurately identifying, understanding, and describing driving safety-critical events (SCEs), including crashes and near-crashes, is crucial for traffic safety, automated driving systems, and advanced driver assistance systems research and application. As SCEs are rare events, most general Vision-Language Models (VLMs) have not been trained sufficiently to link SCE videos and narratives, which could lead to hallucination and missing key safety characteristics. To tackle these challenges, we propose ScVLM, a hybrid approach that combines supervised learning and contrastive learning to improve driving video understanding and event description rationality for VLMs. The proposed approach is trained on and evaluated by more than 8,600 SCEs from the Second Strategic Highway Research Program Naturalistic Driving Study dataset, the largest publicly accessible driving dataset with videos and SCE annotations. The results demonstrate the superiority of the proposed approach in generating contextually accurate event descriptions and mitigate hallucinations from VLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00982v1-abstract-full').style.display = 'none'; document.getElementById('2410.00982v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00379">arXiv:2410.00379</a> <span> [<a href="https://arxiv.org/pdf/2410.00379">pdf</a>, <a href="https://arxiv.org/format/2410.00379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CXPMRG-Bench: Pre-training and Benchmarking for X-ray Medical Report Generation on CheXpert Plus Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuling Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuehang Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Q">Qingchuan Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanfu Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00379v1-abstract-short" style="display: inline;"> X-ray image-based medical report generation (MRG) is a pivotal area in artificial intelligence which can significantly reduce diagnostic burdens and patient wait times. Despite significant progress, we believe that the task has reached a bottleneck due to the limited benchmark datasets and the existing large models' insufficient capability enhancements in this specialized domain. Specifically, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00379v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00379v1-abstract-full" style="display: none;"> X-ray image-based medical report generation (MRG) is a pivotal area in artificial intelligence which can significantly reduce diagnostic burdens and patient wait times. Despite significant progress, we believe that the task has reached a bottleneck due to the limited benchmark datasets and the existing large models' insufficient capability enhancements in this specialized domain. Specifically, the recently released CheXpert Plus dataset lacks comparative evaluation algorithms and their results, providing only the dataset itself. This situation makes the training, evaluation, and comparison of subsequent algorithms challenging. Thus, we conduct a comprehensive benchmarking of existing mainstream X-ray report generation models and large language models (LLMs), on the CheXpert Plus dataset. We believe that the proposed benchmark can provide a solid comparative basis for subsequent algorithms and serve as a guide for researchers to quickly grasp the state-of-the-art models in this field. More importantly, we propose a large model for the X-ray image report generation using a multi-stage pre-training strategy, including self-supervised autoregressive generation and Xray-report contrastive learning, and supervised fine-tuning. Extensive experimental results indicate that the autoregressive pre-training based on Mamba effectively encodes X-ray images, and the image-text contrastive pre-training further aligns the feature spaces, achieving better experimental results. Source code can be found on \url{https://github.com/Event-AHU/Medical_Image_Analysis}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00379v1-abstract-full').style.display = 'none'; document.getElementById('2410.00379v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18486">arXiv:2409.18486</a> <span> [<a href="https://arxiv.org/pdf/2409.18486">pdf</a>, <a href="https://arxiv.org/format/2409.18486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluation of OpenAI o1: Opportunities and Challenges of AGI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+T">Tianyang Zhong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yi Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yutong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shizhe Liang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Y">Yanjun Lyu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaowei Yu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chao Cao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanxu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junhao Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Huawen Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yihen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shaochen Xu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+H">Haixing Dai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lin Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingyuan Chen</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18486v1-abstract-short" style="display: inline;"> This comprehensive study evaluates the performance of OpenAI's o1-preview large language model across a diverse array of complex reasoning tasks, spanning multiple domains, including computer science, mathematics, natural sciences, medicine, linguistics, and social sciences. Through rigorous testing, o1-preview demonstrated remarkable capabilities, often achieving human-level or superior performan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18486v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18486v1-abstract-full" style="display: none;"> This comprehensive study evaluates the performance of OpenAI's o1-preview large language model across a diverse array of complex reasoning tasks, spanning multiple domains, including computer science, mathematics, natural sciences, medicine, linguistics, and social sciences. Through rigorous testing, o1-preview demonstrated remarkable capabilities, often achieving human-level or superior performance in areas ranging from coding challenges to scientific reasoning and from language processing to creative problem-solving. Key findings include: -83.3% success rate in solving complex competitive programming problems, surpassing many human experts. -Superior ability in generating coherent and accurate radiology reports, outperforming other evaluated models. -100% accuracy in high school-level mathematical reasoning tasks, providing detailed step-by-step solutions. -Advanced natural language inference capabilities across general and specialized domains like medicine. -Impressive performance in chip design tasks, outperforming specialized models in areas such as EDA script generation and bug analysis. -Remarkable proficiency in anthropology and geology, demonstrating deep understanding and reasoning in these specialized fields. -Strong capabilities in quantitative investing. O1 has comprehensive financial knowledge and statistical modeling skills. -Effective performance in social media analysis, including sentiment analysis and emotion recognition. The model excelled particularly in tasks requiring intricate reasoning and knowledge integration across various fields. While some limitations were observed, including occasional errors on simpler problems and challenges with certain highly specialized concepts, the overall results indicate significant progress towards artificial general intelligence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18486v1-abstract-full').style.display = 'none'; document.getElementById('2409.18486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17728">arXiv:2409.17728</a> <span> [<a href="https://arxiv.org/pdf/2409.17728">pdf</a>, <a href="https://arxiv.org/format/2409.17728">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AlterMOMA: Fusion Redundancy Pruning for Camera-LiDAR Fusion Models with Alternative Modality Masking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shiqi Sun</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yantao Lu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ning Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">JinChao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17728v1-abstract-short" style="display: inline;"> Camera-LiDAR fusion models significantly enhance perception performance in autonomous driving. The fusion mechanism leverages the strengths of each modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR fusion models utilize pre-trained backbones for efficient training. However, we argue that directly loading single-modal pre-trained camera and LiDAR backbones into camera-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17728v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17728v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17728v1-abstract-full" style="display: none;"> Camera-LiDAR fusion models significantly enhance perception performance in autonomous driving. The fusion mechanism leverages the strengths of each modality while minimizing their weaknesses. Moreover, in practice, camera-LiDAR fusion models utilize pre-trained backbones for efficient training. However, we argue that directly loading single-modal pre-trained camera and LiDAR backbones into camera-LiDAR fusion models introduces similar feature redundancy across modalities due to the nature of the fusion mechanism. Unfortunately, existing pruning methods are developed explicitly for single-modal models, and thus, they struggle to effectively identify these specific redundant parameters in camera-LiDAR fusion models. In this paper, to address the issue above on camera-LiDAR fusion models, we propose a novelty pruning framework Alternative Modality Masking Pruning (AlterMOMA), which employs alternative masking on each modality and identifies the redundant parameters. Specifically, when one modality parameters are masked (deactivated), the absence of features from the masked backbone compels the model to reactivate previous redundant features of the other modality backbone. Therefore, these redundant features and relevant redundant parameters can be identified via the reactivation process. The redundant parameters can be pruned by our proposed importance score evaluation function, Alternative Evaluation (AlterEva), which is based on the observation of the loss changes when certain modality parameters are activated and deactivated. Extensive experiments on the nuScene and KITTI datasets encompassing diverse tasks, baseline models, and pruning algorithms showcase that AlterMOMA outperforms existing pruning methods, attaining state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17728v1-abstract-full').style.display = 'none'; document.getElementById('2409.17728v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 3 figures, Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14115">arXiv:2409.14115</a> <span> [<a href="https://arxiv.org/pdf/2409.14115">pdf</a>, <a href="https://arxiv.org/format/2409.14115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Aerial Grasping with Soft Aerial Vehicle Using Disturbance Observer-Based Model Predictive Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheung%2C+H+C">Hiu Ching Cheung</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bailun Jiang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yang Hu</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+H+K">Henry K. Chu</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+C">Chih-Yung Wen</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Ching-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14115v1-abstract-short" style="display: inline;"> Aerial grasping, particularly soft aerial grasping, holds significant promise for drone delivery and harvesting tasks. However, controlling UAV dynamics during aerial grasping presents considerable challenges. The increased mass during payload grasping adversely affects thrust prediction, while unpredictable environmental disturbances further complicate control efforts. In this study, our objectiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14115v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14115v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14115v1-abstract-full" style="display: none;"> Aerial grasping, particularly soft aerial grasping, holds significant promise for drone delivery and harvesting tasks. However, controlling UAV dynamics during aerial grasping presents considerable challenges. The increased mass during payload grasping adversely affects thrust prediction, while unpredictable environmental disturbances further complicate control efforts. In this study, our objective aims to enhance the control of the Soft Aerial Vehicle (SAV) during aerial grasping by incorporating a disturbance observer into a Nonlinear Model Predictive Control (NMPC) SAV controller. By integrating the disturbance observer into the NMPC SAV controller, we aim to compensate for dynamic model idealization and uncertainties arising from additional payloads and unpredictable disturbances. Our approach combines a disturbance observer-based NMPC with the SAV controller, effectively minimizing tracking errors and enabling precise aerial grasping along all three axes. The proposed SAV equipped with Disturbance Observer-based Nonlinear Model Predictive Control (DOMPC) demonstrates remarkable capabilities in handling both static and non-static payloads, leading to the successful grasping of various objects. Notably, our SAV achieves an impressive payload-to-weight ratio, surpassing previous investigations in the domain of soft grasping. Using the proposed soft aerial vehicle weighing 1.002 kg, we achieve a maximum payload of 337 g by grasping. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14115v1-abstract-full').style.display = 'none'; document.getElementById('2409.14115v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 10 figures, submitted to IEEE Robotics Automation Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06741">arXiv:2409.06741</a> <span> [<a href="https://arxiv.org/pdf/2409.06741">pdf</a>, <a href="https://arxiv.org/format/2409.06741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generative AI for Requirements Engineering: A Systematic Literature Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Haowei Cheng</a>, <a href="/search/cs?searchtype=author&query=Husen%2C+J+H">Jati H. Husen</a>, <a href="/search/cs?searchtype=author&query=Peralta%2C+S+R">Sien Reeve Peralta</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bowen Jiang</a>, <a href="/search/cs?searchtype=author&query=Yoshioka%2C+N">Nobukazu Yoshioka</a>, <a href="/search/cs?searchtype=author&query=Ubayashi%2C+N">Naoyasu Ubayashi</a>, <a href="/search/cs?searchtype=author&query=Washizaki%2C+H">Hironori Washizaki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06741v1-abstract-short" style="display: inline;"> Context: Generative AI (GenAI) has emerged as a transformative tool in software engineering, with requirements engineering (RE) actively exploring its potential to revolutionize processes and outcomes. The integration of GenAI into RE presents both promising opportunities and significant challenges that necessitate systematic analysis and evaluation. Objective: This paper presents a comprehensive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06741v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06741v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06741v1-abstract-full" style="display: none;"> Context: Generative AI (GenAI) has emerged as a transformative tool in software engineering, with requirements engineering (RE) actively exploring its potential to revolutionize processes and outcomes. The integration of GenAI into RE presents both promising opportunities and significant challenges that necessitate systematic analysis and evaluation. Objective: This paper presents a comprehensive systematic literature review (SLR) analyzing state-of-the-art applications and innovative proposals leveraging GenAI in RE. It surveys studies focusing on the utilization of GenAI to enhance RE processes while identifying key challenges and opportunities in this rapidly evolving field. Method: A rigorous SLR methodology was used to analyze 27 carefully selected primary studies in-depth. The review examined research questions pertaining to the application of GenAI across various RE phases, the models and techniques used, and the challenges encountered in implementation and adoption. Results: The most salient findings include i) a predominant focus on the early stages of RE, particularly the elicitation and analysis of requirements, indicating potential for expansion into later phases; ii) the dominance of large language models, especially the GPT series, highlighting the need for diverse AI approaches; and iii) persistent challenges in domain-specific applications and the interpretability of AI-generated outputs, underscoring areas requiring further research and development. Conclusions: The results highlight the critical need for comprehensive evaluation frameworks, improved human-AI collaboration models, and thorough consideration of ethical implications in GenAI-assisted RE. Future research should prioritize extending GenAI applications across the entire RE lifecycle, enhancing domain-specific capabilities, and developing strategies for responsible AI integration in RE practices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06741v1-abstract-full').style.display = 'none'; document.getElementById('2409.06741v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06299">arXiv:2409.06299</a> <span> [<a href="https://arxiv.org/pdf/2409.06299">pdf</a>, <a href="https://arxiv.org/format/2409.06299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Long Video Understanding via Hierarchical Event-Based Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+D">Dingxin Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingda Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingyu Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yongxin Guo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bin Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingbin Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+B">Bo Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06299v1-abstract-short" style="display: inline;"> Recently, integrating visual foundation models into large language models (LLMs) to form video understanding systems has attracted widespread attention. Most of the existing models compress diverse semantic information within the whole video and feed it into LLMs for content comprehension. While this method excels in short video understanding, it may result in a blend of multiple event information… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06299v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06299v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06299v1-abstract-full" style="display: none;"> Recently, integrating visual foundation models into large language models (LLMs) to form video understanding systems has attracted widespread attention. Most of the existing models compress diverse semantic information within the whole video and feed it into LLMs for content comprehension. While this method excels in short video understanding, it may result in a blend of multiple event information in long videos due to coarse compression, which causes information redundancy. Consequently, the semantics of key events might be obscured within the vast information that hinders the model's understanding capabilities. To address this issue, we propose a Hierarchical Event-based Memory-enhanced LLM (HEM-LLM) for better understanding of long videos. Firstly, we design a novel adaptive sequence segmentation scheme to divide multiple events within long videos. In this way, we can perform individual memory modeling for each event to establish intra-event contextual connections, thereby reducing information redundancy. Secondly, while modeling current event, we compress and inject the information of the previous event to enhance the long-term inter-event dependencies in videos. Finally, we perform extensive experiments on various video understanding tasks and the results show that our model achieves state-of-the-art performances. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06299v1-abstract-full').style.display = 'none'; document.getElementById('2409.06299v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04768">arXiv:2409.04768</a> <span> [<a href="https://arxiv.org/pdf/2409.04768">pdf</a>, <a href="https://arxiv.org/format/2409.04768">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Medical Image Segmentation via Single-Source Domain Generalization with Random Amplitude Spectrum Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiao%2C+Q">Qiang Qiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenyu Wang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+M">Meixia Qu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+K">Kun Su</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bin Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qiang Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04768v1-abstract-short" style="display: inline;"> The field of medical image segmentation is challenged by domain generalization (DG) due to domain shifts in clinical datasets. The DG challenge is exacerbated by the scarcity of medical data and privacy concerns. Traditional single-source domain generalization (SSDG) methods primarily rely on stacking data augmentation techniques to minimize domain discrepancies. In this paper, we propose Random A… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04768v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04768v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04768v1-abstract-full" style="display: none;"> The field of medical image segmentation is challenged by domain generalization (DG) due to domain shifts in clinical datasets. The DG challenge is exacerbated by the scarcity of medical data and privacy concerns. Traditional single-source domain generalization (SSDG) methods primarily rely on stacking data augmentation techniques to minimize domain discrepancies. In this paper, we propose Random Amplitude Spectrum Synthesis (RASS) as a training augmentation for medical images. RASS enhances model generalization by simulating distribution changes from a frequency perspective. This strategy introduces variability by applying amplitude-dependent perturbations to ensure broad coverage of potential domain variations. Furthermore, we propose random mask shuffle and reconstruction components, which can enhance the ability of the backbone to process structural information and increase resilience intra- and cross-domain changes. The proposed Random Amplitude Spectrum Synthesis for Single-Source Domain Generalization (RAS^4DG) is validated on 3D fetal brain images and 2D fundus photography, and achieves an improved DG segmentation performance compared to other SSDG models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04768v1-abstract-full').style.display = 'none'; document.getElementById('2409.04768v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 4 figures, Medical Image Computing and Computer Assisted Intervention 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02834">arXiv:2409.02834</a> <span> [<a href="https://arxiv.org/pdf/2409.02834">pdf</a>, <a href="https://arxiv.org/format/2409.02834">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the Mathematics Reasoning of Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wentao Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Q">Qianjun Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhuo Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Ji Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+A">Aimin Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qin Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02834v3-abstract-short" style="display: inline;"> Large language models (LLMs) have obtained promising results in mathematical reasoning, which is a foundational skill for human intelligence. Most previous studies focus on improving and measuring the performance of LLMs based on textual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few researchers have released English multimodal math datasets (e.g., MATHVISTA and MATH-V) to evaluate t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02834v3-abstract-full').style.display = 'inline'; document.getElementById('2409.02834v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02834v3-abstract-full" style="display: none;"> Large language models (LLMs) have obtained promising results in mathematical reasoning, which is a foundational skill for human intelligence. Most previous studies focus on improving and measuring the performance of LLMs based on textual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few researchers have released English multimodal math datasets (e.g., MATHVISTA and MATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In this paper, we release a Chinese multimodal math (CMM-Math) dataset, including benchmark and training parts, to evaluate and enhance the mathematical reasoning of LMMs. CMM-Math contains over 28,000 high-quality samples, featuring a variety of problem types (e.g., multiple-choice, fill-in-the-blank, and so on) with detailed solutions across 12 grade levels from elementary to high school in China. Specifically, the visual context may be present in the questions or opinions, which makes this dataset more challenging. Through comprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math dataset face challenges, emphasizing the necessity for further improvements in LMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to handle the problems with mixed input of multiple images and text segments. We train our model using three stages, including foundational pre-training, foundational fine-tuning, and mathematical fine-tuning. The extensive experiments indicate that our model effectively improves math reasoning performance by comparing it with the SOTA LMMs over three multimodal mathematical datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02834v3-abstract-full').style.display = 'none'; document.getElementById('2409.02834v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00968">arXiv:2409.00968</a> <span> [<a href="https://arxiv.org/pdf/2409.00968">pdf</a>, <a href="https://arxiv.org/format/2409.00968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Solving Integrated Process Planning and Scheduling Problem via Graph Neural Network Based Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongpei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziyan He</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yunkai Jia</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiang Huang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+D">Dongdong Ge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00968v1-abstract-short" style="display: inline;"> The Integrated Process Planning and Scheduling (IPPS) problem combines process route planning and shop scheduling to achieve high efficiency in manufacturing and maximize resource utilization, which is crucial for modern manufacturing systems. Traditional methods using Mixed Integer Linear Programming (MILP) and heuristic algorithms can not well balance solution quality and speed when solving IPPS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00968v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00968v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00968v1-abstract-full" style="display: none;"> The Integrated Process Planning and Scheduling (IPPS) problem combines process route planning and shop scheduling to achieve high efficiency in manufacturing and maximize resource utilization, which is crucial for modern manufacturing systems. Traditional methods using Mixed Integer Linear Programming (MILP) and heuristic algorithms can not well balance solution quality and speed when solving IPPS. In this paper, we propose a novel end-to-end Deep Reinforcement Learning (DRL) method. We model the IPPS problem as a Markov Decision Process (MDP) and employ a Heterogeneous Graph Neural Network (GNN) to capture the complex relationships among operations, machines, and jobs. To optimize the scheduling strategy, we use Proximal Policy Optimization (PPO). Experimental results show that, compared to traditional methods, our approach significantly improves solution efficiency and quality in large-scale IPPS instances, providing superior scheduling strategies for modern intelligent manufacturing systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00968v1-abstract-full').style.display = 'none'; document.getElementById('2409.00968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15018">arXiv:2408.15018</a> <span> [<a href="https://arxiv.org/pdf/2408.15018">pdf</a>, <a href="https://arxiv.org/format/2408.15018">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Cross-subject Brain Functional Connectivity Analysis for Multi-task Cognitive State Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Anqi Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bingkun Jiang</a>, <a href="/search/cs?searchtype=author&query=Obaidat%2C+M+S">Mohammad S. Obaidat</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Ni Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15018v1-abstract-short" style="display: inline;"> Cognition refers to the function of information perception and processing, which is the fundamental psychological essence of human beings. It is responsible for reasoning and decision-making, while its evaluation is significant for the aviation domain in mitigating potential safety risks. Existing studies tend to use varied methods for cognitive state evaluation yet have limitations in timeliness,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15018v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15018v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15018v1-abstract-full" style="display: none;"> Cognition refers to the function of information perception and processing, which is the fundamental psychological essence of human beings. It is responsible for reasoning and decision-making, while its evaluation is significant for the aviation domain in mitigating potential safety risks. Existing studies tend to use varied methods for cognitive state evaluation yet have limitations in timeliness, generalisation, and interpretability. Accordingly, this study adopts brain functional connectivity with electroencephalography signals to capture associations in brain regions across multiple subjects for evaluating real-time cognitive states. Specifically, a virtual reality-based flight platform is constructed with multi-screen embedded. Three distinctive cognitive tasks are designed and each has three degrees of difficulty. Thirty subjects are acquired for analysis and evaluation. The results are interpreted through different perspectives, including inner-subject and cross-subject for task-wise and gender-wise underlying brain functional connectivity. Additionally, this study incorporates questionnaire-based, task performance-based, and physiological measure-based approaches to fairly label the trials. A multi-class cognitive state evaluation is further conducted with the active brain connections. Benchmarking results demonstrate that the identified brain regions have considerable influences in cognition, with a multi-class accuracy rate of 95.83% surpassing existing studies. The derived findings bring significance to understanding the dynamic relationships among human brain functional regions, cross-subject cognitive behaviours, and decision-making, which have promising practical application values. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15018v1-abstract-full').style.display = 'none'; document.getElementById('2408.15018v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14122">arXiv:2408.14122</a> <span> [<a href="https://arxiv.org/pdf/2408.14122">pdf</a>, <a href="https://arxiv.org/format/2408.14122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> FG-SAT: Efficient Flow Graph for Encrypted Traffic Classification under Environment Shifts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+S">Susu Cui</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xueying Han</a>, <a href="/search/cs?searchtype=author&query=Han%2C+D">Dongqi Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiliang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weihang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yun Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Baoxu Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhigang Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14122v1-abstract-short" style="display: inline;"> Encrypted traffic classification plays a critical role in network security and management. Currently, mining deep patterns from side-channel contents and plaintext fields through neural networks is a major solution. However, existing methods have two major limitations: (1) They fail to recognize the critical link between transport layer mechanisms and applications, missing the opportunity to learn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14122v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14122v1-abstract-full" style="display: none;"> Encrypted traffic classification plays a critical role in network security and management. Currently, mining deep patterns from side-channel contents and plaintext fields through neural networks is a major solution. However, existing methods have two major limitations: (1) They fail to recognize the critical link between transport layer mechanisms and applications, missing the opportunity to learn internal structure features for accurate traffic classification. (2) They assume network traffic in an unrealistically stable and singular environment, making it difficult to effectively classify real-world traffic under environment shifts. In this paper, we propose FG-SAT, the first end-to-end method for encrypted traffic analysis under environment shifts. We propose a key abstraction, the Flow Graph, to represent flow internal relationship structures and rich node attributes, which enables robust and generalized representation. Additionally, to address the problem of inconsistent data distribution under environment shifts, we introduce a novel feature selection algorithm based on Jensen-Shannon divergence (JSD) to select robust node attributes. Finally, we design a classifier, GraphSAT, which integrates GraphSAGE and GAT to deeply learn Flow Graph features, enabling accurate encrypted traffic identification. FG-SAT exhibits both efficient and robust classification performance under environment shifts and outperforms state-of-the-art methods in encrypted attack detection and application classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14122v1-abstract-full').style.display = 'none'; document.getElementById('2408.14122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Ready to submit to IEEE Transactions on Information Forensics and Security (TIFS)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12340">arXiv:2408.12340</a> <span> [<a href="https://arxiv.org/pdf/2408.12340">pdf</a>, <a href="https://arxiv.org/format/2408.12340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VTON-HandFit: Virtual Try-on for Arbitrary Hand Pose Guided by Hand Priors Embedding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yujie Liang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=WU%2C+K">Kai WU</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Wenhui Han</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+T">Taisong Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12340v2-abstract-short" style="display: inline;"> Although diffusion-based image virtual try-on has made considerable progress, emerging approaches still struggle to effectively address the issue of hand occlusion (i.e., clothing regions occluded by the hand part), leading to a notable degradation of the try-on performance. To tackle this issue widely existing in real-world scenarios, we propose VTON-HandFit, leveraging the power of hand priors t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12340v2-abstract-full').style.display = 'inline'; document.getElementById('2408.12340v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12340v2-abstract-full" style="display: none;"> Although diffusion-based image virtual try-on has made considerable progress, emerging approaches still struggle to effectively address the issue of hand occlusion (i.e., clothing regions occluded by the hand part), leading to a notable degradation of the try-on performance. To tackle this issue widely existing in real-world scenarios, we propose VTON-HandFit, leveraging the power of hand priors to reconstruct the appearance and structure for hand occlusion cases. Firstly, we tailor a Handpose Aggregation Net using the ControlNet-based structure explicitly and adaptively encoding the global hand and pose priors. Besides, to fully exploit the hand-related structure and appearance information, we propose Hand-feature Disentanglement Embedding module to disentangle the hand priors into the hand structure-parametric and visual-appearance features, and customize a masked cross attention for further decoupled feature embedding. Lastly, we customize a hand-canny constraint loss to better learn the structure edge knowledge from the hand template of model image. VTON-HandFit outperforms the baselines in qualitative and quantitative evaluations on the public dataset and our self-collected hand-occlusion Handfit-3K dataset particularly for the arbitrary hand pose occlusion cases in real-world scenarios. The Code and dataset will be available at \url{https://github.com/VTON-HandFit/VTON-HandFit}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12340v2-abstract-full').style.display = 'none'; document.getElementById('2408.12340v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The project page is \url{https://vton-handfit.github.io}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10488">arXiv:2408.10488</a> <span> [<a href="https://arxiv.org/pdf/2408.10488">pdf</a>, <a href="https://arxiv.org/format/2408.10488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Event Stream based Sign Language Translation: A High-Definition Benchmark Dataset and A New Algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+Y">Yao Rong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuling Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianing Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10488v1-abstract-short" style="display: inline;"> Sign Language Translation (SLT) is a core task in the field of AI-assisted disability. Unlike traditional SLT based on visible light videos, which is easily affected by factors such as lighting, rapid hand movements, and privacy breaches, this paper proposes the use of high-definition Event streams for SLT, effectively mitigating the aforementioned issues. This is primarily because Event streams h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10488v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10488v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10488v1-abstract-full" style="display: none;"> Sign Language Translation (SLT) is a core task in the field of AI-assisted disability. Unlike traditional SLT based on visible light videos, which is easily affected by factors such as lighting, rapid hand movements, and privacy breaches, this paper proposes the use of high-definition Event streams for SLT, effectively mitigating the aforementioned issues. This is primarily because Event streams have a high dynamic range and dense temporal signals, which can withstand low illumination and motion blur well. Additionally, due to their sparsity in space, they effectively protect the privacy of the target person. More specifically, we propose a new high-resolution Event stream sign language dataset, termed Event-CSL, which effectively fills the data gap in this area of research. It contains 14,827 videos, 14,821 glosses, and 2,544 Chinese words in the text vocabulary. These samples are collected in a variety of indoor and outdoor scenes, encompassing multiple angles, light intensities, and camera movements. We have benchmarked existing mainstream SLT works to enable fair comparison for future efforts. Based on this dataset and several other large-scale datasets, we propose a novel baseline method that fully leverages the Mamba model's ability to integrate temporal information of CNN features, resulting in improved sign language translation outcomes. Both the benchmark dataset and source code will be released on https://github.com/Event-AHU/OpenESL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10488v1-abstract-full').style.display = 'none'; document.getElementById('2408.10488v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">First Large-scale and High-Definition Benchmark Dataset for Event-based Sign Language Translation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10487">arXiv:2408.10487</a> <span> [<a href="https://arxiv.org/pdf/2408.10487">pdf</a>, <a href="https://arxiv.org/format/2408.10487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MambaEVT: Event Stream based Visual Object Tracking using State Space Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=wang%2C+C">Chao wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xixi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhicheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10487v1-abstract-short" style="display: inline;"> Event camera-based visual tracking has drawn more and more attention in recent years due to the unique imaging principle and advantages of low energy consumption, high dynamic range, and dense temporal resolution. Current event-based tracking algorithms are gradually hitting their performance bottlenecks, due to the utilization of vision Transformer and the static template for target object locali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10487v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10487v1-abstract-full" style="display: none;"> Event camera-based visual tracking has drawn more and more attention in recent years due to the unique imaging principle and advantages of low energy consumption, high dynamic range, and dense temporal resolution. Current event-based tracking algorithms are gradually hitting their performance bottlenecks, due to the utilization of vision Transformer and the static template for target object localization. In this paper, we propose a novel Mamba-based visual tracking framework that adopts the state space model with linear complexity as a backbone network. The search regions and target template are fed into the vision Mamba network for simultaneous feature extraction and interaction. The output tokens of search regions will be fed into the tracking head for target localization. More importantly, we consider introducing a dynamic template update strategy into the tracking framework using the Memory Mamba network. By considering the diversity of samples in the target template library and making appropriate adjustments to the template memory module, a more effective dynamic template can be integrated. The effective combination of dynamic and static templates allows our Mamba-based tracking algorithm to achieve a good balance between accuracy and computational cost on multiple large-scale datasets, including EventVOT, VisEvent, and FE240hz. The source code will be released on https://github.com/Event-AHU/MambaEVT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10487v1-abstract-full').style.display = 'none'; document.getElementById('2408.10487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09764">arXiv:2408.09764</a> <span> [<a href="https://arxiv.org/pdf/2408.09764">pdf</a>, <a href="https://arxiv.org/format/2408.09764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Event Stream based Human Action Recognition: A High-Definition Benchmark Dataset and Algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiao Wang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+P">Pengpeng Shao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yonghong Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09764v1-abstract-short" style="display: inline;"> Human Action Recognition (HAR) stands as a pivotal research domain in both computer vision and artificial intelligence, with RGB cameras dominating as the preferred tool for investigation and innovation in this field. However, in real-world applications, RGB cameras encounter numerous challenges, including light conditions, fast motion, and privacy concerns. Consequently, bio-inspired event camera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09764v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09764v1-abstract-full" style="display: none;"> Human Action Recognition (HAR) stands as a pivotal research domain in both computer vision and artificial intelligence, with RGB cameras dominating as the preferred tool for investigation and innovation in this field. However, in real-world applications, RGB cameras encounter numerous challenges, including light conditions, fast motion, and privacy concerns. Consequently, bio-inspired event cameras have garnered increasing attention due to their advantages of low energy consumption, high dynamic range, etc. Nevertheless, most existing event-based HAR datasets are low resolution ($346 \times 260$). In this paper, we propose a large-scale, high-definition ($1280 \times 800$) human action recognition dataset based on the CeleX-V event camera, termed CeleX-HAR. It encompasses 150 commonly occurring action categories, comprising a total of 124,625 video sequences. Various factors such as multi-view, illumination, action speed, and occlusion are considered when recording these data. To build a more comprehensive benchmark dataset, we report over 20 mainstream HAR models for future works to compare. In addition, we also propose a novel Mamba vision backbone network for event stream based HAR, termed EVMamba, which equips the spatial plane multi-directional scanning and novel voxel temporal scanning mechanism. By encoding and mining the spatio-temporal information of event streams, our EVMamba has achieved favorable results across multiple datasets. Both the dataset and source code will be released on \url{https://github.com/Event-AHU/CeleX-HAR} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09764v1-abstract-full').style.display = 'none'; document.getElementById('2408.09764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09743">arXiv:2408.09743</a> <span> [<a href="https://arxiv.org/pdf/2408.09743">pdf</a>, <a href="https://arxiv.org/format/2408.09743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> R2GenCSR: Retrieving Context Samples for Large Language Model based X-ray Medical Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuehang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fuling Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chuanfu Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09743v1-abstract-short" style="display: inline;"> Inspired by the tremendous success of Large Language Models (LLMs), existing X-ray medical report generation methods attempt to leverage large models to achieve better performance. They usually adopt a Transformer to extract the visual features of a given X-ray image, and then, feed them into the LLM for text generation. How to extract more effective information for the LLMs to help them improve f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09743v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09743v1-abstract-full" style="display: none;"> Inspired by the tremendous success of Large Language Models (LLMs), existing X-ray medical report generation methods attempt to leverage large models to achieve better performance. They usually adopt a Transformer to extract the visual features of a given X-ray image, and then, feed them into the LLM for text generation. How to extract more effective information for the LLMs to help them improve final results is an urgent problem that needs to be solved. Additionally, the use of visual Transformer models also brings high computational complexity. To address these issues, this paper proposes a novel context-guided efficient X-ray medical report generation framework. Specifically, we introduce the Mamba as the vision backbone with linear complexity, and the performance obtained is comparable to that of the strong Transformer model. More importantly, we perform context retrieval from the training set for samples within each mini-batch during the training phase, utilizing both positively and negatively related samples to enhance feature representation and discriminative learning. Subsequently, we feed the vision tokens, context information, and prompt statements to invoke the LLM for generating high-quality medical reports. Extensive experiments on three X-ray report generation datasets (i.e., IU-Xray, MIMIC-CXR, CheXpert Plus) fully validated the effectiveness of our proposed model. The source code of this work will be released on \url{https://github.com/Event-AHU/Medical_Image_Analysis}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09743v1-abstract-full').style.display = 'none'; document.getElementById('2408.09743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08078">arXiv:2408.08078</a> <span> [<a href="https://arxiv.org/pdf/2408.08078">pdf</a>, <a href="https://arxiv.org/format/2408.08078">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Treat Stillness with Movement: Remote Sensing Change Detection via Coarse-grained Temporal Foregrounds Mining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xixi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zitian Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jingtao Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08078v1-abstract-short" style="display: inline;"> Current works focus on addressing the remote sensing change detection task using bi-temporal images. Although good performance can be achieved, however, seldom of they consider the motion cues which may also be vital. In this work, we revisit the widely adopted bi-temporal images-based framework and propose a novel Coarse-grained Temporal Mining Augmented (CTMA) framework. To be specific, given th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08078v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08078v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08078v1-abstract-full" style="display: none;"> Current works focus on addressing the remote sensing change detection task using bi-temporal images. Although good performance can be achieved, however, seldom of they consider the motion cues which may also be vital. In this work, we revisit the widely adopted bi-temporal images-based framework and propose a novel Coarse-grained Temporal Mining Augmented (CTMA) framework. To be specific, given the bi-temporal images, we first transform them into a video using interpolation operations. Then, a set of temporal encoders is adopted to extract the motion features from the obtained video for coarse-grained changed region prediction. Subsequently, we design a novel Coarse-grained Foregrounds Augmented Spatial Encoder module to integrate both global and local information. We also introduce a motion augmented strategy that leverages motion cues as an additional output to aggregate with the spatial features for improved results. Meanwhile, we feed the input image pairs into the ResNet to get the different features and also the spatial blocks for fine-grained feature learning. More importantly, we propose a mask augmented strategy that utilizes coarse-grained changed regions, incorporating them into the decoder blocks to enhance the final changed prediction. Extensive experiments conducted on multiple benchmark datasets fully validated the effectiveness of our proposed framework for remote sensing image change detection. The source code of this paper will be released on https://github.com/Event-AHU/CTM_Remote_Sensing_Change_Detection <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08078v1-abstract-full').style.display = 'none'; document.getElementById('2408.08078v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Peer Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03723">arXiv:2408.03723</a> <span> [<a href="https://arxiv.org/pdf/2408.03723">pdf</a>, <a href="https://arxiv.org/format/2408.03723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> MS-Mapping: An Uncertainty-Aware Large-Scale Multi-Session LiDAR Mapping System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiangcheng Hu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jin Wu</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+J">Jianhao Jiao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Binqian Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenshuo Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Ping Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03723v1-abstract-short" style="display: inline;"> Large-scale multi-session LiDAR mapping is essential for a wide range of applications, including surveying, autonomous driving, crowdsourced mapping, and multi-agent navigation. However, existing approaches often struggle with data redundancy, robustness, and accuracy in complex environments. To address these challenges, we present MS-Mapping, an novel multi-session LiDAR mapping system that emplo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03723v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03723v1-abstract-full" style="display: none;"> Large-scale multi-session LiDAR mapping is essential for a wide range of applications, including surveying, autonomous driving, crowdsourced mapping, and multi-agent navigation. However, existing approaches often struggle with data redundancy, robustness, and accuracy in complex environments. To address these challenges, we present MS-Mapping, an novel multi-session LiDAR mapping system that employs an incremental mapping scheme for robust and accurate map assembly in large-scale environments. Our approach introduces three key innovations: 1) A distribution-aware keyframe selection method that captures the subtle contributions of each point cloud frame to the map by analyzing the similarity of map distributions. This method effectively reduces data redundancy and pose graph size, while enhancing graph optimization speed; 2) An uncertainty model that automatically performs least-squares adjustments according to the covariance matrix during graph optimization, improving mapping precision, robustness, and flexibility without the need for scene-specific parameter tuning. This uncertainty model enables our system to monitor pose uncertainty and avoid ill-posed optimizations, thereby increasing adaptability to diverse and challenging environments. 3) To ensure fair evaluation, we redesign baseline comparisons and the evaluation benchmark. Direct assessment of map accuracy demonstrates the superiority of the proposed MS-Mapping algorithm compared to state-of-the-art methods. In addition to employing public datasets such as Urban-Nav, FusionPortable, and Newer College, we conducted extensive experiments on such a large \SI{855}{m}$\times$\SI{636}{m} ground truth map, collecting over \SI{20}{km} of indoor and outdoor data across more than ten sequences... <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03723v1-abstract-full').style.display = 'none'; document.getElementById('2408.03723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 22 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03519">arXiv:2408.03519</a> <span> [<a href="https://arxiv.org/pdf/2408.03519">pdf</a>, <a href="https://arxiv.org/format/2408.03519">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RepoMasterEval: Evaluating Code Completion via Real-World Repositories </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qinyun Wu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chao Peng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+P">Pengfei Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+R">Ruida Hu</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+H">Haoyu Gan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jinhe Tang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhiwen Deng</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Z">Zhanming Guan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Cuiyun Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xia Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+P">Ping Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03519v1-abstract-short" style="display: inline;"> With the growing reliance on automated code completion tools in software development, the need for robust evaluation benchmarks has become critical. However, existing benchmarks focus more on code generation tasks in function and class level and provide rich text description to prompt the model. By contrast, such descriptive prompt is commonly unavailable in real development and code completion ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03519v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03519v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03519v1-abstract-full" style="display: none;"> With the growing reliance on automated code completion tools in software development, the need for robust evaluation benchmarks has become critical. However, existing benchmarks focus more on code generation tasks in function and class level and provide rich text description to prompt the model. By contrast, such descriptive prompt is commonly unavailable in real development and code completion can occur in wider range of situations such as in the middle of a function or a code block. These limitations makes the evaluation poorly align with the practical scenarios of code completion tools. In this paper, we propose RepoMasterEval, a novel benchmark for evaluating code completion models constructed from real-world Python and TypeScript repositories. Each benchmark datum is generated by masking a code snippet (ground truth) from one source code file with existing test suites. To improve test accuracy of model generated code, we employ mutation testing to measure the effectiveness of the test cases and we manually crafted new test cases for those test suites with low mutation score. Our empirical evaluation on 6 state-of-the-art models shows that test argumentation is critical in improving the accuracy of the benchmark and RepoMasterEval is able to report difference in model performance in real-world scenarios. The deployment of RepoMasterEval in a collaborated company for one month also revealed that the benchmark is useful to give accurate feedback during model training and the score is in high correlation with the model's performance in practice. Based on our findings, we call for the software engineering community to build more LLM benchmarks tailored for code generation tools taking the practical and complex development environment into consideration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03519v1-abstract-full').style.display = 'none'; document.getElementById('2408.03519v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02503">arXiv:2408.02503</a> <span> [<a href="https://arxiv.org/pdf/2408.02503">pdf</a>, <a href="https://arxiv.org/format/2408.02503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UnifiedMLLM: Enabling Unified Representation for Multi-modal Multi-tasks With Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaowei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">YiQing Cai</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xu Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Hang Song</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Botian Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhida Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02503v1-abstract-short" style="display: inline;"> Significant advancements has recently been achieved in the field of multi-modal large language models (MLLMs), demonstrating their remarkable capabilities in understanding and reasoning across diverse tasks. However, these models are often trained for specific tasks and rely on task-specific input-output formats, limiting their applicability to a broader range of tasks. This raises a fundamental q… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02503v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02503v1-abstract-full" style="display: none;"> Significant advancements has recently been achieved in the field of multi-modal large language models (MLLMs), demonstrating their remarkable capabilities in understanding and reasoning across diverse tasks. However, these models are often trained for specific tasks and rely on task-specific input-output formats, limiting their applicability to a broader range of tasks. This raises a fundamental question: Can we develop a unified approach to represent and handle different multi-modal tasks to maximize the generalizability of MLLMs? In this paper, we propose UnifiedMLLM, a comprehensive model designed to represent various tasks using a unified representation. Our model exhibits strong capabilities in comprehending the implicit intent of user instructions and preforming reasoning. In addition to generating textual responses, our model also outputs task tokens and grounding tokens, serving as indicators of task types and task granularity. These outputs are subsequently routed through the task router and directed to specific expert models for task completion. To train our model, we construct a task-specific dataset and an 100k multi-task dataset encompassing complex scenarios. Employing a three-stage training strategy, we equip our model with robust reasoning and task processing capabilities while preserving its generalization capacity and knowledge reservoir. Extensive experiments showcase the impressive performance of our unified representation approach across various tasks, surpassing existing methodologies. Furthermore, our approach exhibits exceptional scalability and generality. Our code, model, and dataset will be available at \url{https://github.com/lzw-lzw/UnifiedMLLM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02503v1-abstract-full').style.display = 'none'; document.getElementById('2408.02503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01733">arXiv:2408.01733</a> <span> [<a href="https://arxiv.org/pdf/2408.01733">pdf</a>, <a href="https://arxiv.org/format/2408.01733">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3650212.3652142">10.1145/3650212.3652142 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CoEdPilot: Recommending Code Edits with Learned Prior Edit Relevance, Project-wise Awareness, and Interactive Nature </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenyan Liu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yufan Cai</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yun Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuhuan Huang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+Y">Yunrui Pei</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+P">Ping Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J+S">Jin Song Dong</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+H">Hong Mei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01733v1-abstract-short" style="display: inline;"> Recent years have seen the development of LLM-based code generation. Compared to generating code in a software project, incremental code edits are empirically observed to be more frequent. The emerging code editing approaches usually formulate the problem as generating an edit based on known relevant prior edits and context. However, practical code edits can be more complicated. First, an editing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01733v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01733v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01733v1-abstract-full" style="display: none;"> Recent years have seen the development of LLM-based code generation. Compared to generating code in a software project, incremental code edits are empirically observed to be more frequent. The emerging code editing approaches usually formulate the problem as generating an edit based on known relevant prior edits and context. However, practical code edits can be more complicated. First, an editing session can include multiple (ir)relevant edits to the code under edit. Second, the inference of the subsequent edits is non-trivial as the scope of its ripple effect can be the whole project. In this work, we propose CoEdPilot, an LLM-driven solution to recommend code edits by discriminating the relevant edits, exploring their interactive natures, and estimating its ripple effect in the project. Specifically, CoEdPilot orchestrates multiple neural transformers to identify what and how to edit in the project regarding both edit location and edit content. When a user accomplishes an edit with an optional editing description, a Subsequent Edit Analysis first reports the most relevant files in the project with what types of edits (e.g., keep, insert, and replace) can happen for each line of their code. Next, an Edit-content Generator generates concrete edit options for the lines of code, regarding its relevant prior changes reported by an Edit-dependency Analyzer. Lastly, both the Subsequent Edit Analysis and the Edit-content Generator capture relevant prior edits as feedback to readjust their recommendations. We train our models by collecting over 180K commits from 471 open-source projects in 5 programming languages. Our extensive experiments show that CoEdPilot can well predict the edits (i.e., predicting edit location with an accuracy of 70.8%-85.3%, and the edit content with an exact match rate of 41.8% and BLEU4 score of 60.7)... <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01733v1-abstract-full').style.display = 'none'; document.getElementById('2408.01733v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17535">arXiv:2407.17535</a> <span> [<a href="https://arxiv.org/pdf/2407.17535">pdf</a>, <a href="https://arxiv.org/format/2407.17535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> LAMBDA: A Large Model Based Data Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maojun Sun</a>, <a href="/search/cs?searchtype=author&query=Han%2C+R">Ruijian Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Binyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+H">Houduo Qi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Defeng Sun</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yancheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jian Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17535v2-abstract-short" style="display: inline;"> We introduce LArge Model Based Data Agent (LAMBDA), a novel open-source, code-free multi-agent data analysis system that leverages the power of large models. LAMBDA is designed to address data analysis challenges in complex data-driven applications through innovatively designed data agents that operate iteratively and generatively using natural language. At the core of LAMBDA are two key agent rol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17535v2-abstract-full').style.display = 'inline'; document.getElementById('2407.17535v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17535v2-abstract-full" style="display: none;"> We introduce LArge Model Based Data Agent (LAMBDA), a novel open-source, code-free multi-agent data analysis system that leverages the power of large models. LAMBDA is designed to address data analysis challenges in complex data-driven applications through innovatively designed data agents that operate iteratively and generatively using natural language. At the core of LAMBDA are two key agent roles: the programmer and the inspector, which are engineered to work together seamlessly. Specifically, the programmer generates code based on the user's instructions and domain-specific knowledge, enhanced by advanced models. Meanwhile, the inspector debugs the code when necessary. To ensure robustness and handle adverse scenarios, LAMBDA features a user interface that allows direct user intervention in the operational loop. Additionally, LAMBDA can flexibly integrate external models and algorithms through our proposed Knowledge Integration Mechanism, catering to the needs of customized data analysis. LAMBDA has demonstrated strong performance on various data analysis tasks. It has the potential to enhance data analysis paradigms by seamlessly integrating human and artificial intelligence, making it more accessible, effective, and efficient for users from diverse backgrounds. The strong performance of LAMBDA in solving data analysis problems is demonstrated using real-world data examples. Videos of several case studies are available at https://xxxlambda.github.io/lambda_webpage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17535v2-abstract-full').style.display = 'none'; document.getElementById('2407.17535v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">51 pages, 23 figures and 6 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62-04; 62-08; 68T01; 68T09 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17451">arXiv:2407.17451</a> <span> [<a href="https://arxiv.org/pdf/2407.17451">pdf</a>, <a href="https://arxiv.org/format/2407.17451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> BlueTempNet: A Temporal Multi-network Dataset of Social Interactions in Bluesky Social </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jeong%2C+U">Ujun Jeong</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bohan Jiang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Bernard%2C+H+R">H. Russell Bernard</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17451v2-abstract-short" style="display: inline;"> Decentralized social media platforms like Bluesky Social (Bluesky) have made it possible to publicly disclose some user behaviors with millisecond-level precision. Embracing Bluesky's principles of open-source and open-data, we present the first collection of the temporal dynamics of user-driven social interactions. BlueTempNet integrates multiple types of networks into a single multi-network, inc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17451v2-abstract-full').style.display = 'inline'; document.getElementById('2407.17451v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17451v2-abstract-full" style="display: none;"> Decentralized social media platforms like Bluesky Social (Bluesky) have made it possible to publicly disclose some user behaviors with millisecond-level precision. Embracing Bluesky's principles of open-source and open-data, we present the first collection of the temporal dynamics of user-driven social interactions. BlueTempNet integrates multiple types of networks into a single multi-network, including user-to-user interactions (following and blocking users) and user-to-community interactions (creating and joining communities). Communities are user-formed groups in custom Feeds, where users subscribe to posts aligned with their interests. Following Bluesky's public data policy, we collect existing Bluesky Feeds, including the users who liked and generated these Feeds, and provide tools to gather users' social interactions within a date range. This data-collection strategy captures past user behaviors and supports the future data collection of user behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17451v2-abstract-full').style.display = 'none'; document.getElementById('2407.17451v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to IEEE Data Descriptions 24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17349">arXiv:2407.17349</a> <span> [<a href="https://arxiv.org/pdf/2407.17349">pdf</a>, <a href="https://arxiv.org/format/2407.17349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Boosting Large Language Models with Socratic Method for Conversational Mathematics Teaching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yuyang Ding</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hanglei Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qin Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17349v1-abstract-short" style="display: inline;"> With the introduction of large language models (LLMs), automatic math reasoning has seen tremendous success. However, current methods primarily focus on providing solutions or using techniques like Chain-of-Thought to enhance problem-solving accuracy. In this paper, we focus on improving the capability of mathematics teaching via a Socratic teaching-based LLM (\texttt{SocraticLLM}), which guides l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17349v1-abstract-full').style.display = 'inline'; document.getElementById('2407.17349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17349v1-abstract-full" style="display: none;"> With the introduction of large language models (LLMs), automatic math reasoning has seen tremendous success. However, current methods primarily focus on providing solutions or using techniques like Chain-of-Thought to enhance problem-solving accuracy. In this paper, we focus on improving the capability of mathematics teaching via a Socratic teaching-based LLM (\texttt{SocraticLLM}), which guides learners toward profound thinking with clarity and self-discovery via conversation. We collect and release a high-quality mathematical teaching dataset, named \texttt{SocraticMATH}, which provides Socratic-style conversations of problems with extra knowledge. Also, we propose a knowledge-enhanced LLM as a strong baseline to generate reliable responses with review, guidance/heuristic, rectification, and summarization. Experimental results show the great advantages of \texttt{SocraticLLM} by comparing it with several strong generative models. The codes and datasets are available on \url{https://github.com/ECNU-ICALK/SocraticMath}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17349v1-abstract-full').style.display = 'none'; document.getElementById('2407.17349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted By CIKM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08585">arXiv:2407.08585</a> <span> [<a href="https://arxiv.org/pdf/2407.08585">pdf</a>, <a href="https://arxiv.org/format/2407.08585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HACMan++: Spatially-Grounded Motion Primitives for Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bowen Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yilin Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Wenxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Paxton%2C+C">Chris Paxton</a>, <a href="/search/cs?searchtype=author&query=Held%2C+D">David Held</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08585v1-abstract-short" style="display: inline;"> Although end-to-end robot learning has shown some success for robot manipulation, the learned policies are often not sufficiently robust to variations in object pose or geometry. To improve the policy generalization, we introduce spatially-grounded parameterized motion primitives in our method HACMan++. Specifically, we propose an action representation consisting of three components: what primitiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08585v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08585v1-abstract-full" style="display: none;"> Although end-to-end robot learning has shown some success for robot manipulation, the learned policies are often not sufficiently robust to variations in object pose or geometry. To improve the policy generalization, we introduce spatially-grounded parameterized motion primitives in our method HACMan++. Specifically, we propose an action representation consisting of three components: what primitive type (such as grasp or push) to execute, where the primitive will be grounded (e.g. where the gripper will make contact with the world), and how the primitive motion is executed, such as parameters specifying the push direction or grasp orientation. These three components define a novel discrete-continuous action space for reinforcement learning. Our framework enables robot agents to learn to chain diverse motion primitives together and select appropriate primitive parameters to complete long-horizon manipulation tasks. By grounding the primitives on a spatial location in the environment, our method is able to effectively generalize across object shape and pose variations. Our approach significantly outperforms existing methods, particularly in complex scenarios demanding both high-level sequential reasoning and object generalization. With zero-shot sim-to-real transfer, our policy succeeds in challenging real-world manipulation tasks, with generalization to unseen objects. Videos can be found on the project website: https://sgmp-rss2024.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08585v1-abstract-full').style.display = 'none'; document.getElementById('2407.08585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05550">arXiv:2407.05550</a> <span> [<a href="https://arxiv.org/pdf/2407.05550">pdf</a>, <a href="https://arxiv.org/format/2407.05550">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MEEG and AT-DGNN: Improving EEG Emotion Recognition with Music Introducing and Graph-based Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Minghao Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhengxi Zhu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+K">Kang Xie</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bin Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05550v4-abstract-short" style="display: inline;"> We present the MEEG dataset, a multi-modal collection of music-induced electroencephalogram (EEG) recordings designed to capture emotional responses to various musical stimuli across different valence and arousal levels. This public dataset facilitates an in-depth examination of brainwave patterns within musical contexts, providing a robust foundation for studying brain network topology during emo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05550v4-abstract-full').style.display = 'inline'; document.getElementById('2407.05550v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05550v4-abstract-full" style="display: none;"> We present the MEEG dataset, a multi-modal collection of music-induced electroencephalogram (EEG) recordings designed to capture emotional responses to various musical stimuli across different valence and arousal levels. This public dataset facilitates an in-depth examination of brainwave patterns within musical contexts, providing a robust foundation for studying brain network topology during emotional processing. Leveraging the MEEG dataset, we introduce the Attention-based Temporal Learner with Dynamic Graph Neural Network (AT-DGNN), a novel framework for EEG-based emotion recognition. This model combines an attention mechanism with a dynamic graph neural network (DGNN) to capture intricate EEG dynamics. The AT-DGNN achieves state-of-the-art (SOTA) performance with an accuracy of 83.74% in arousal recognition and 86.01% in valence recognition, outperforming existing SOTA methods. Comparative analysis with traditional datasets, such as DEAP, further validates the model's effectiveness and underscores the potency of music as an emotional stimulus. This study advances graph-based learning methodology in brain-computer interfaces (BCI), significantly improving the accuracy of EEG-based emotion recognition. The MEEG dataset and source code are publicly available at https://github.com/xmh1011/AT-DGNN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05550v4-abstract-full').style.display = 'none'; document.getElementById('2407.05550v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03900">arXiv:2407.03900</a> <span> [<a href="https://arxiv.org/pdf/2407.03900">pdf</a>, <a href="https://arxiv.org/format/2407.03900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Oracle Bone Inscriptions Multi-modal Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+B">Bang Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yujie Liang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zengmao Ding</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xu Peng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shengwei Han</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+D">Dan Sui</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+P">Peichao Qin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+P">Pian Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chaoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yun Qi</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+T">Taisong Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaoming Huang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhan Shu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+R">Rongrong Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongge Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunsheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03900v1-abstract-short" style="display: inline;"> Oracle bone inscriptions(OBI) is the earliest developed writing system in China, bearing invaluable written exemplifications of early Shang history and paleography. However, the task of deciphering OBI, in the current climate of the scholarship, can prove extremely challenging. Out of the 4,500 oracle bone characters excavated, only a third have been successfully identified. Therefore, leveraging… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03900v1-abstract-full').style.display = 'inline'; document.getElementById('2407.03900v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03900v1-abstract-full" style="display: none;"> Oracle bone inscriptions(OBI) is the earliest developed writing system in China, bearing invaluable written exemplifications of early Shang history and paleography. However, the task of deciphering OBI, in the current climate of the scholarship, can prove extremely challenging. Out of the 4,500 oracle bone characters excavated, only a third have been successfully identified. Therefore, leveraging the advantages of advanced AI technology to assist in the decipherment of OBI is a highly essential research topic. However, fully utilizing AI's capabilities in these matters is reliant on having a comprehensive and high-quality annotated OBI dataset at hand whereas most existing datasets are only annotated in just a single or a few dimensions, limiting the value of their potential application. For instance, the Oracle-MNIST dataset only offers 30k images classified into 10 categories. Therefore, this paper proposes an Oracle Bone Inscriptions Multi-modal Dataset(OBIMD), which includes annotation information for 10,077 pieces of oracle bones. Each piece has two modalities: pixel-level aligned rubbings and facsimiles. The dataset annotates the detection boxes, character categories, transcriptions, corresponding inscription groups, and reading sequences in the groups of each oracle bone character, providing a comprehensive and high-quality level of annotations. This dataset can be used for a variety of AI-related research tasks relevant to the field of OBI, such as OBI Character Detection and Recognition, Rubbing Denoising, Character Matching, Character Generation, Reading Sequence Prediction, Missing Characters Completion task and so on. We believe that the creation and publication of a dataset like this will help significantly advance the application of AI algorithms in the field of OBI research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03900v1-abstract-full').style.display = 'none'; document.getElementById('2407.03900v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.03876">arXiv:2407.03876</a> <span> [<a href="https://arxiv.org/pdf/2407.03876">pdf</a>, <a href="https://arxiv.org/format/2407.03876">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Automated Progressive Red Teaming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bojian Jiang</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+Y">Yi Jing</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+T">Tianhao Shen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tong Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qing Yang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+D">Deyi Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.03876v2-abstract-short" style="display: inline;"> Ensuring the safety of large language models (LLMs) is paramount, yet identifying potential vulnerabilities is challenging. While manual red teaming is effective, it is time-consuming, costly and lacks scalability. Automated red teaming (ART) offers a more cost-effective alternative, automatically generating adversarial prompts to expose LLM vulnerabilities. However, in current ART efforts, a robu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03876v2-abstract-full').style.display = 'inline'; document.getElementById('2407.03876v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.03876v2-abstract-full" style="display: none;"> Ensuring the safety of large language models (LLMs) is paramount, yet identifying potential vulnerabilities is challenging. While manual red teaming is effective, it is time-consuming, costly and lacks scalability. Automated red teaming (ART) offers a more cost-effective alternative, automatically generating adversarial prompts to expose LLM vulnerabilities. However, in current ART efforts, a robust framework is absent, which explicitly frames red teaming as an effectively learnable task. To address this gap, we propose Automated Progressive Red Teaming (APRT) as an effectively learnable framework. APRT leverages three core modules: an Intention Expanding LLM that generates diverse initial attack samples, an Intention Hiding LLM that crafts deceptive prompts, and an Evil Maker to manage prompt diversity and filter ineffective samples. The three modules collectively and progressively explore and exploit LLM vulnerabilities through multi-round interactions. In addition to the framework, we further propose a novel indicator, Attack Effectiveness Rate (AER) to mitigate the limitations of existing evaluation metrics. By measuring the likelihood of eliciting unsafe but seemingly helpful responses, AER aligns closely with human evaluations. Extensive experiments with both automatic and human evaluations, demonstrate the effectiveness of ARPT across both open- and closed-source LLMs. Specifically, APRT effectively elicits 54% unsafe yet useful responses from Meta's Llama-3-8B-Instruct, 50% from GPT-4o (API access), and 39% from Claude-3.5 (API access), showcasing its robust attack capability and transferability across LLMs (especially from open-source LLMs to closed-source LLMs). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.03876v2-abstract-full').style.display = 'none'; document.getElementById('2407.03876v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02455">arXiv:2407.02455</a> <span> [<a href="https://arxiv.org/pdf/2407.02455">pdf</a>, <a href="https://arxiv.org/format/2407.02455">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/IoTDI61053.2024.00020">10.1109/IoTDI61053.2024.00020 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SUPER: Seated Upper Body Pose Estimation using mmWave Radars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zimeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Boyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+R">Rong Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.02455v1-abstract-short" style="display: inline;"> In industrial countries, adults spend a considerable amount of time sedentary each day at work, driving and during activities of daily living. Characterizing the seated upper body human poses using mmWave radars is an important, yet under-studied topic with many applications in human-machine interaction, transportation and road safety. In this work, we devise SUPER, a framework for seated upper bo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02455v1-abstract-full').style.display = 'inline'; document.getElementById('2407.02455v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.02455v1-abstract-full" style="display: none;"> In industrial countries, adults spend a considerable amount of time sedentary each day at work, driving and during activities of daily living. Characterizing the seated upper body human poses using mmWave radars is an important, yet under-studied topic with many applications in human-machine interaction, transportation and road safety. In this work, we devise SUPER, a framework for seated upper body human pose estimation that utilizes dual-mmWave radars in close proximity. A novel masking algorithm is proposed to coherently fuse data from the radars to generate intensity and Doppler point clouds with complementary information for high-motion but small radar cross section areas (e.g., upper extremities) and low-motion but large RCS areas (e.g. torso). A lightweight neural network extracts both global and local features of upper body and output pose parameters for the Skinned Multi-Person Linear (SMPL) model. Extensive leave-one-subject-out experiments on various motion sequences from multiple subjects show that SUPER outperforms a state-of-the-art baseline method by 30 -- 184%. We also demonstrate its utility in a simple downstream task for hand-object interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.02455v1-abstract-full').style.display = 'none'; document.getElementById('2407.02455v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18572">arXiv:2406.18572</a> <span> [<a href="https://arxiv.org/pdf/2406.18572">pdf</a>, <a href="https://arxiv.org/format/2406.18572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GeoReasoner: Geo-localization with Reasoning in Street Views using a Large Vision-Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yu Ye</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bingchuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Wei Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18572v2-abstract-short" style="display: inline;"> This work tackles the problem of geo-localization with a new paradigm using a large vision-language model (LVLM) augmented with human inference knowledge. A primary challenge here is the scarcity of data for training the LVLM - existing street-view datasets often contain numerous low-quality images lacking visual clues, and lack any reasoning inference. To address the data-quality issue, we devise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18572v2-abstract-full').style.display = 'inline'; document.getElementById('2406.18572v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18572v2-abstract-full" style="display: none;"> This work tackles the problem of geo-localization with a new paradigm using a large vision-language model (LVLM) augmented with human inference knowledge. A primary challenge here is the scarcity of data for training the LVLM - existing street-view datasets often contain numerous low-quality images lacking visual clues, and lack any reasoning inference. To address the data-quality issue, we devise a CLIP-based network to quantify the degree of street-view images being locatable, leading to the creation of a new dataset comprising highly locatable street views. To enhance reasoning inference, we integrate external knowledge obtained from real geo-localization games, tapping into valuable human inference capabilities. The data are utilized to train GeoReasoner, which undergoes fine-tuning through dedicated reasoning and location-tuning stages. Qualitative and quantitative evaluations illustrate that GeoReasoner outperforms counterpart LVLMs by more than 25% at country-level and 38% at city-level geo-localization tasks, and surpasses StreetCLIP performance while requiring fewer training resources. The data and code are available at https://github.com/lingli1996/GeoReasoner. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18572v2-abstract-full').style.display = 'none'; document.getElementById('2406.18572v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17992">arXiv:2406.17992</a> <span> [<a href="https://arxiv.org/pdf/2406.17992">pdf</a>, <a href="https://arxiv.org/format/2406.17992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Catching Chameleons: Detecting Evolving Disinformation Generated using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bohan Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengshuai Zhao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17992v1-abstract-short" style="display: inline;"> Despite recent advancements in detecting disinformation generated by large language models (LLMs), current efforts overlook the ever-evolving nature of this disinformation. In this work, we investigate a challenging yet practical research problem of detecting evolving LLM-generated disinformation. Disinformation evolves constantly through the rapid development of LLMs and their variants. As a cons… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17992v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17992v1-abstract-full" style="display: none;"> Despite recent advancements in detecting disinformation generated by large language models (LLMs), current efforts overlook the ever-evolving nature of this disinformation. In this work, we investigate a challenging yet practical research problem of detecting evolving LLM-generated disinformation. Disinformation evolves constantly through the rapid development of LLMs and their variants. As a consequence, the detection model faces significant challenges. First, it is inefficient to train separate models for each disinformation generator. Second, the performance decreases in scenarios when evolving LLM-generated disinformation is encountered in sequential order. To address this problem, we propose DELD (Detecting Evolving LLM-generated Disinformation), a parameter-efficient approach that jointly leverages the general fact-checking capabilities of pre-trained language models (PLM) and the independent disinformation generation characteristics of various LLMs. In particular, the learned characteristics are concatenated sequentially to facilitate knowledge accumulation and transformation. DELD addresses the issue of label scarcity by integrating the semantic embeddings of disinformation with trainable soft prompts to elicit model-specific knowledge. Our experiments show that \textit{DELD} significantly outperforms state-of-the-art methods. Moreover, our method provides critical insights into the unique patterns of disinformation generation across different LLMs, offering valuable perspectives in this line of research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17992v1-abstract-full').style.display = 'none'; document.getElementById('2406.17992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17518">arXiv:2406.17518</a> <span> [<a href="https://arxiv.org/pdf/2406.17518">pdf</a>, <a href="https://arxiv.org/format/2406.17518">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Explainability of Knowledge Learning Paths: Causal Knowledge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yuang Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yizhou Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuan-Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17518v2-abstract-short" style="display: inline;"> A reliable knowledge structure is a prerequisite for building effective adaptive learning systems and intelligent tutoring systems. Pursuing an explainable and trustworthy knowledge structure, we propose a method for constructing causal knowledge networks. This approach leverages Bayesian networks as a foundation and incorporates causal relationship analysis to derive a causal network. Additionall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17518v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17518v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17518v2-abstract-full" style="display: none;"> A reliable knowledge structure is a prerequisite for building effective adaptive learning systems and intelligent tutoring systems. Pursuing an explainable and trustworthy knowledge structure, we propose a method for constructing causal knowledge networks. This approach leverages Bayesian networks as a foundation and incorporates causal relationship analysis to derive a causal network. Additionally, we introduce a dependable knowledge-learning path recommendation technique built upon this framework, improving teaching and learning quality while maintaining transparency in the decision-making process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17518v2-abstract-full').style.display = 'none'; document.getElementById('2406.17518v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures, Educational Data Mining 2024, Human-Centric eXplainable AI in Education</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17238">arXiv:2406.17238</a> <span> [<a href="https://arxiv.org/pdf/2406.17238">pdf</a>, <a href="https://arxiv.org/format/2406.17238">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generative Expansion of Small Datasets: An Expansive Graph Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jebraeeli%2C+V">Vahid Jebraeeli</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Krim%2C+H">Hamid Krim</a>, <a href="/search/cs?searchtype=author&query=Cansever%2C+D">Derya Cansever</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17238v2-abstract-short" style="display: inline;"> Limited data availability in machine learning significantly impacts performance and generalization. Traditional augmentation methods enhance moderately sufficient datasets. GANs struggle with convergence when generating diverse samples. Diffusion models, while effective, have high computational costs. We introduce an Expansive Synthesis model generating large-scale, information-rich datasets from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17238v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17238v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17238v2-abstract-full" style="display: none;"> Limited data availability in machine learning significantly impacts performance and generalization. Traditional augmentation methods enhance moderately sufficient datasets. GANs struggle with convergence when generating diverse samples. Diffusion models, while effective, have high computational costs. We introduce an Expansive Synthesis model generating large-scale, information-rich datasets from minimal samples. It uses expander graph mappings and feature interpolation to preserve data distribution and feature relationships. The model leverages neural networks' non-linear latent space, captured by a Koopman operator, to create a linear feature space for dataset expansion. An autoencoder with self-attention layers and optimal transport refines distributional consistency. We validate by comparing classifiers trained on generated data to those trained on original datasets. Results show comparable performance, demonstrating the model's potential to augment training data effectively. This work advances data generation, addressing scarcity in machine learning applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17238v2-abstract-full').style.display = 'none'; document.getElementById('2406.17238v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures and 2 tables. Under review in ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.14846">arXiv:2406.14846</a> <span> [<a href="https://arxiv.org/pdf/2406.14846">pdf</a>, <a href="https://arxiv.org/format/2406.14846">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Graph Edge Representation via Tensor Product Graph Convolutional Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+S">Sheng Ge</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Beibei Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jin Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Bin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.14846v1-abstract-short" style="display: inline;"> Graph Convolutional Networks (GCNs) have been widely studied. The core of GCNs is the definition of convolution operators on graphs. However, existing Graph Convolution (GC) operators are mainly defined on adjacency matrix and node features and generally focus on obtaining effective node embeddings which cannot be utilized to address the graphs with (high-dimensional) edge features. To address thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14846v1-abstract-full').style.display = 'inline'; document.getElementById('2406.14846v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.14846v1-abstract-full" style="display: none;"> Graph Convolutional Networks (GCNs) have been widely studied. The core of GCNs is the definition of convolution operators on graphs. However, existing Graph Convolution (GC) operators are mainly defined on adjacency matrix and node features and generally focus on obtaining effective node embeddings which cannot be utilized to address the graphs with (high-dimensional) edge features. To address this problem, by leveraging tensor contraction representation and tensor product graph diffusion theories, this paper analogously defines an effective convolution operator on graphs with edge features which is named as Tensor Product Graph Convolution (TPGC). The proposed TPGC aims to obtain effective edge embeddings. It provides a complementary model to traditional graph convolutions (GCs) to address the more general graph data analysis with both node and edge features. Experimental results on several graph learning tasks demonstrate the effectiveness of the proposed TPGC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.14846v1-abstract-full').style.display = 'none'; document.getElementById('2406.14846v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12896">arXiv:2406.12896</a> <span> [<a href="https://arxiv.org/pdf/2406.12896">pdf</a>, <a href="https://arxiv.org/format/2406.12896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Pedagogical Theories to Understand Student Learning Process with Graph-based Reasonable Knowledge Tracing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+J">Jiajun Cui</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+H">Hong Qian</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12896v1-abstract-short" style="display: inline;"> Knowledge tracing (KT) is a crucial task in intelligent education, focusing on predicting students' performance on given questions to trace their evolving knowledge. The advancement of deep learning in this field has led to deep-learning knowledge tracing (DLKT) models that prioritize high predictive accuracy. However, many existing DLKT methods overlook the fundamental goal of tracking students'… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12896v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12896v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12896v1-abstract-full" style="display: none;"> Knowledge tracing (KT) is a crucial task in intelligent education, focusing on predicting students' performance on given questions to trace their evolving knowledge. The advancement of deep learning in this field has led to deep-learning knowledge tracing (DLKT) models that prioritize high predictive accuracy. However, many existing DLKT methods overlook the fundamental goal of tracking students' dynamical knowledge mastery. These models do not explicitly model knowledge mastery tracing processes or yield unreasonable results that educators find difficulty to comprehend and apply in real teaching scenarios. In response, our research conducts a preliminary analysis of mainstream KT approaches to highlight and explain such unreasonableness. We introduce GRKT, a graph-based reasonable knowledge tracing method to address these issues. By leveraging graph neural networks, our approach delves into the mutual influences of knowledge concepts, offering a more accurate representation of how the knowledge mastery evolves throughout the learning process. Additionally, we propose a fine-grained and psychological three-stage modeling process as knowledge retrieval, memory strengthening, and knowledge learning/forgetting, to conduct a more reasonable knowledge tracing process. Comprehensive experiments demonstrate that GRKT outperforms eleven baselines across three datasets, not only enhancing predictive accuracy but also generating more reasonable knowledge tracing results. This makes our model a promising advancement for practical implementation in educational settings. The source code is available at https://github.com/JJCui96/GRKT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12896v1-abstract-full').style.display = 'none'; document.getElementById('2406.12896v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint, accepted to appear in SIGKDD 2024, 12 pages. The source code is available at https://github.com/JJCui96/GRKT. Keywords: interpretable knowledge tracing, student behavior modeling, intelligence education</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Jiang%2C+B&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository