Search | arXiv e-print repository

Showing 1&ndash;50 of 259 results for author: Yan, M
Searching in archive cs.
1. arXiv:2412.07135 [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.AR (Hardware Architecture)
Title: Oreo: Protecting ASLR Against Microarchitectural Attacks (Extended Version)
Authors: Shixin Song, Joseph Zhang, Mengjia Yan
Abstract: Address Space Layout Randomization (ASLR) is one of the most prominently deployed mitigations against memory corruption attacks. ASLR randomly shuffles program virtual addresses to prevent attackers from knowing the location of program contents in memory. Microarchitectural side channels have been shown to defeat ASLR through various hardware mechanisms. We systematically analyze existing microarchitectural attacks and identify multiple leakage paths. Given the vast attack surface exposed by ASLR, it is challenging to effectively prevent leaking the ASLR secret against microarchitectural attacks. Motivated by this, we present Oreo, a software-hardware co-design mitigation that strengthens ASLR against these attacks. Oreo uses a new memory mapping interface to remove secret randomized bits in virtual addresses before translating them to their corresponding physical addresses. This extra step hides randomized virtual addresses from microarchitecture structures, preventing side channels from leaking ASLR secrets. Oreo is transparent to user programs and incurs low overhead. We prototyped and evaluated our design on Linux using the hardware simulator gem5.
Submitted 9 December, 2024; originally announced December 2024.
Comments: This paper has been accepted to NDSS 2025
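As a toy illustration of the masking step described above (this is not code from the paper; the bit range and function name are invented for the example), stripping the randomized bits before an address reaches translation might look like this:

```python
# Toy model of the idea only; which virtual-address bits ASLR randomizes is a
# made-up assumption here, not taken from the paper.
RANDOMIZED_BITS_MASK = 0x0000_7FFF_C000_0000  # hypothetical ASLR-randomized VA bits

def address_for_translation(virtual_addr: int) -> int:
    """Clear the secret randomized bits so downstream (micro)architectural
    structures never observe them."""
    return virtual_addr & ~RANDOMIZED_BITS_MASK

# Two addresses that differ only in the randomized bits map to the same masked
# address, so observing the masked value reveals nothing about the ASLR secret.
a = 0x0000_1234_5678_9ABC
b = a ^ 0x0000_2AAA_8000_0000  # flip some of the (hypothetical) randomized bits
assert address_for_translation(a) == address_for_translation(b)
```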
2. arXiv:2412.03347 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: DIVE: Taming DINO for Subject-Driven Video Editing
Authors: Yi Huang, Wei Xiong, He Zhang, Chaoqi Chen, Jianzhuang Liu, Mingfu Yan, Shifeng Chen
Abstract: Building on the success of diffusion models in image generation and editing, video editing has recently gained substantial attention. However, maintaining temporal consistency and motion alignment still remains challenging. To address these issues, this paper proposes DINO-guided Video Editing (DIVE), a framework designed to facilitate subject-driven editing in source videos conditioned on either target text prompts or reference images with specific identities. The core of DIVE lies in leveraging the powerful semantic features extracted from a pretrained DINOv2 model as implicit correspondences to guide the editing process. Specifically, to ensure temporal motion consistency, DIVE employs DINO features to align with the motion trajectory of the source video. For precise subject editing, DIVE incorporates the DINO features of reference images into a pretrained text-to-image model to learn Low-Rank Adaptations (LoRAs), effectively registering the target subject's identity. Extensive experiments on diverse real-world videos demonstrate that our framework can achieve high-quality editing results with robust motion consistency, highlighting the potential of DINO to contribute to video editing. Project page: https://dino-video-editing.github.io
Submitted 4 December, 2024; originally announced December 2024.

3. arXiv:2411.11909 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: SymDPO: Boosting In-Context Learning of Large Multimodal Models with Symbol Demonstration Direct Preference Optimization
Authors: Hongrui Jia, Chaoya Jiang, Haiyang Xu, Wei Ye, Mengfan Dong, Ming Yan, Ji Zhang, Fei Huang, Shikun Zhang
Abstract: As language models continue to scale, Large Language Models (LLMs) have exhibited emerging capabilities in In-Context Learning (ICL), enabling them to solve language tasks by prefixing a few in-context demonstrations (ICDs) as context. Inspired by these advancements, researchers have extended these techniques to develop Large Multimodal Models (LMMs) with ICL capabilities. However, existing LMMs face a critical issue: they often fail to effectively leverage the visual context in multimodal demonstrations and instead simply follow textual patterns. This indicates that LMMs do not achieve effective alignment between multimodal demonstrations and model outputs. To address this problem, we propose Symbol Demonstration Direct Preference Optimization (SymDPO). Specifically, SymDPO aims to break the traditional paradigm of constructing multimodal demonstrations by using random symbols to replace text answers within instances. This forces the model to carefully understand the demonstration images and establish a relationship between the images and the symbols to answer questions correctly. We validate the effectiveness of this method on multiple benchmarks, demonstrating that with SymDPO, LMMs can more effectively understand the multimodal context within examples and utilize this knowledge to answer questions better. Code is available at https://github.com/APiaoG/SymDPO.
Submitted 21 November, 2024; v1 submitted 17 November, 2024; originally announced November 2024.
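To make the symbol-demonstration idea concrete, here is a minimal sketch of replacing the text answers of multimodal in-context demonstrations with random symbols; the record format and symbol length are assumptions for illustration, not the paper's actual pipeline.

```python
import random
import string

def symbolize_demonstrations(demos, symbol_len=3, seed=None):
    """Replace the text answer of each in-context demonstration with a random
    symbol string, so a model can only answer correctly by grounding the symbol
    in the demonstration image. `demos` is assumed to be a list of dicts like
    {"image": ..., "question": ..., "answer": ...} (hypothetical format)."""
    rng = random.Random(seed)
    symbolized = []
    for demo in demos:
        symbol = "".join(rng.choices(string.ascii_uppercase, k=symbol_len))
        symbolized.append({**demo, "answer": symbol})
    return symbolized

# Example: the answer "a red bus" might become "QZD"; the model must then link
# the image content to "QZD" instead of pattern-matching familiar text answers.
```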
4. arXiv:2411.00850 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: GWQ: Gradient-Aware Weight Quantization for Large Language Models
Authors: Yihua Shao, Siyu Liang, Zijian Ling, Minxi Yan, Haiyang Liu, Siyu Chen, Ziyang Yan, Chenyu Zhang, Haotong Qin, Michele Magno, Yang Yang, Zhen Lei, Yan Wang, Jingcai Guo, Ling Shao, Hao Tang
Abstract: Large language models (LLMs) show impressive performance in solving complex language tasks. However, their large number of parameters presents significant challenges for deploying and applying them on edge devices. Compressing large language models to low bit-widths can enable them to run on resource-constrained devices, but often leads to performance degradation. To address this problem, we propose gradient-aware weight quantization (GWQ), the first low-bit weight quantization approach that leverages gradients to localize outliers, requiring only a minimal amount of calibration data for outlier detection. GWQ preferentially retains the weights corresponding to the top 1% of outliers at FP16 precision, while the remaining non-outlier weights are stored in a low-bit format. We find experimentally that localizing sensitive weights via gradients is more effective than localizing them via the Hessian matrix. Compared with current quantization methods, GWQ can be applied to multiple language models and achieves lower perplexity on the WikiText2 and C4 datasets. On zero-shot tasks, GWQ-quantized models achieve higher accuracy than other quantization methods. GWQ is also suitable for multimodal model quantization, and the quantized Qwen-VL family of models is more accurate than with other methods. On the zero-shot target detection dataset RefCOCO, GWQ outperforms the current state-of-the-art method SPQR. GWQ achieves a 1.2x inference speedup over the original model and effectively reduces inference memory.
Submitted 4 December, 2024; v1 submitted 30 October, 2024; originally announced November 2024.
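A rough sketch of the recipe the abstract describes (keep roughly the 1% most gradient-sensitive weights in FP16 and quantize the rest to a low bit-width) could look as follows; the symmetric quantizer and the helper below are illustrative assumptions, not the authors' code.

```python
import torch

def gwq_split(weight, grad, outlier_frac=0.01, n_bits=4):
    """Split one weight tensor into FP16 'outliers' chosen by gradient magnitude
    (gradients come from a small calibration batch) and low-bit quantized
    non-outliers. Hypothetical helper; GWQ's actual quantizer may differ."""
    sensitivity = grad.abs().flatten()
    k = max(1, int(outlier_frac * sensitivity.numel()))
    threshold = sensitivity.topk(k).values.min()
    outlier_mask = grad.abs() >= threshold

    outliers_fp16 = (weight * outlier_mask).to(torch.float16)  # ~1% kept at FP16

    # symmetric uniform quantization for the remaining weights
    rest = weight * (~outlier_mask)
    qmax = 2 ** (n_bits - 1) - 1
    scale = rest.abs().max().clamp(min=1e-8) / qmax
    q_rest = torch.clamp(torch.round(rest / scale), -qmax - 1, qmax).to(torch.int8)
    return outliers_fp16, q_rest, scale, outlier_mask
```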
5. arXiv:2410.23136 [pdf, other]
Subjects: cs.IR (Information Retrieval)
Title: Real-Time Personalization for LLM-based Recommendation with Customized In-Context Learning
Authors: Keqin Bao, Ming Yan, Yang Zhang, Jizhi Zhang, Wenjie Wang, Fuli Feng, Xiangnan He
Abstract: Frequently updating Large Language Model (LLM)-based recommender systems to adapt to new user interests -- as done for traditional ones -- is impractical due to high training costs, even with acceleration methods. This work explores adapting to dynamic user interests without any model updates by leveraging In-Context Learning (ICL), which allows LLMs to learn new tasks from few-shot examples provided in the input. Using new-interest examples as the ICL few-shot examples, LLMs may learn real-time interest directly, avoiding the need for model updates. However, existing LLM-based recommenders often lose the in-context learning ability during recommendation tuning, while the original LLM's in-context learning lacks recommendation-specific focus. To address this, we propose RecICL, which customizes recommendation-specific in-context learning for real-time recommendations. RecICL organizes training examples in an in-context learning format, ensuring that in-context learning ability is preserved and aligned with the recommendation task during tuning. Extensive experiments demonstrate RecICL's effectiveness in delivering real-time recommendations without requiring model updates. Our code is available at https://github.com/ym689/rec_icl.
Submitted 30 October, 2024; originally announced October 2024.
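As a rough illustration of "organizing examples in an in-context learning format" for real-time interests, recent (history, next item) pairs can be serialized as few-shot demonstrations ahead of the current request; the template below is hypothetical, not RecICL's actual prompt.

```python
def build_icl_prompt(recent_examples, current_history):
    """Format a user's recent (history -> next item) pairs as few-shot
    demonstrations, followed by the current history to complete.
    Field names and wording are assumptions for illustration."""
    shots = [
        f"User history: {', '.join(history)}\nNext item: {next_item}"
        for history, next_item in recent_examples
    ]
    query = f"User history: {', '.join(current_history)}\nNext item:"
    return "\n\n".join(shots + [query])

# Example usage:
# prompt = build_icl_prompt(
#     recent_examples=[(["movie A", "movie B"], "movie C")],
#     current_history=["movie B", "movie C"],
# )
```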
6. arXiv:2410.20750 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: ODRL: A Benchmark for Off-Dynamics Reinforcement Learning
Authors: Jiafei Lyu, Kang Xu, Jiacheng Xu, Mengbei Yan, Jingwen Yang, Zongzhang Zhang, Chenjia Bai, Zongqing Lu, Xiu Li
Abstract: We consider off-dynamics reinforcement learning (RL), where one needs to transfer policies across different domains with dynamics mismatch. Despite the focus on developing dynamics-aware algorithms, this field is hindered by the lack of a standard benchmark. To bridge this gap, we introduce ODRL, the first benchmark tailored for evaluating off-dynamics RL methods. ODRL contains four experimental settings where the source and target domains can be either online or offline, and provides diverse tasks and a broad spectrum of dynamics shifts, making it a reliable platform to comprehensively evaluate an agent's ability to adapt to the target domain. Furthermore, ODRL includes recent off-dynamics RL algorithms in a unified framework and introduces some extra baselines for different settings, all implemented in a single-file manner. To unpack the true adaptation capability of existing methods, we conduct extensive benchmarking experiments, which show that no method has universal advantages across varied dynamics shifts. We hope this benchmark can serve as a cornerstone for future research endeavors. Our code is publicly available at https://github.com/OffDynamicsRL/off-dynamics-rl.
Submitted 28 October, 2024; originally announced October 2024.
Comments: NeurIPS 2024 D&B Track

7. arXiv:2410.18479 [pdf, other]
Subjects: cs.SE (Software Engineering)
DOI: 10.1145/3671016.3671388
Title: DFEPT: Data Flow Embedding for Enhancing Pre-Trained Model Based Vulnerability Detection
Authors: Zhonghao Jiang, Weifeng Sun, Xiaoyan Gu, Jiaxin Wu, Tao Wen, Haibo Hu, Meng Yan
Abstract: Software vulnerabilities represent one of the most pressing threats to computing systems. Identifying vulnerabilities in source code is crucial for protecting user privacy and reducing economic losses. Traditional static analysis tools rely on security experts to manually build detection rules, a process that requires substantial time and manpower and struggles to adapt to new vulnerabilities. The emergence of pre-trained code language models has provided a new solution for automated vulnerability detection. However, code pre-training models are typically based on token-level large-scale pre-training, which hampers their ability to effectively capture the structural and dependency relationships among code segments. In the context of software vulnerabilities, certain types of vulnerabilities are related to the dependency relationships within the code. Consequently, identifying and analyzing these vulnerability samples presents a significant challenge for pre-trained models. In this paper, we propose DFEPT, a data flow embedding technique that enhances the performance of pre-trained models on vulnerability detection tasks by providing them with effective vulnerability data flow information. Specifically, we parse data flow graphs (DFGs) from function-level source code and use the data type of each variable as the node feature of the DFG. By applying graph learning techniques, we embed the data flow graph and incorporate relative positional information into the graph embedding using sine positional encoding to ensure the completeness of vulnerability data flow information. Our research shows that DFEPT can provide effective vulnerability semantic information to pre-trained models, achieving an accuracy of 64.97% on the Devign dataset and an F1-score of 47.9% on the Reveal dataset.
Submitted 24 October, 2024; originally announced October 2024.
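The sine positional encoding mentioned above is presumably the standard sinusoidal scheme; a minimal sketch of adding it to data-flow-graph node embeddings follows (the node ordering, dimensions, and pooling are assumptions, not DFEPT's exact design).

```python
import numpy as np

def sinusoidal_positional_encoding(num_nodes: int, dim: int) -> np.ndarray:
    """Standard sine/cosine positional encoding for `num_nodes` positions."""
    pos = np.arange(num_nodes)[:, None]                    # (num_nodes, 1)
    i = np.arange(dim)[None, :]                            # (1, dim)
    angle = pos / np.power(10000.0, (2 * (i // 2)) / dim)
    return np.where(i % 2 == 0, np.sin(angle), np.cos(angle))

# node_embeddings: (num_nodes, dim) vectors for DFG nodes (e.g., keyed by the
# variable's data type, as described above). One simple way to form a graph
# embedding that carries relative position information:
# graph_embedding = (node_embeddings
#                    + sinusoidal_positional_encoding(num_nodes, dim)).mean(axis=0)
```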
8. arXiv:2410.18368 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AR (Hardware Architecture)
Title: Multi-objective Optimization in CPU Design Space Exploration: Attention is All You Need
Authors: Runzhen Xue, Hao Wu, Mingyu Yan, Ziheng Xiao, Xiaochun Ye, Dongrui Fan
Abstract: Design space exploration (DSE) enables architects to systematically evaluate various design options, guiding decisions on the most suitable configurations to meet specific objectives such as optimizing performance, power, and area. However, the growing complexity of modern CPUs has dramatically increased the number of micro-architectural parameters and expanded the overall design space, making DSE more challenging and time-consuming. Existing DSE frameworks struggle in large-scale design spaces due to inaccurate models and limited insight into parameter impact, hindering efficient identification of optimal micro-architectures within tight timeframes. In this work, we introduce AttentionDSE. Its key idea is to use the attention mechanism to establish a direct mapping from micro-architectural parameters to their contributions to predicted performance. This approach enhances both the prediction accuracy and the interpretability of the performance model. Furthermore, the attention weights are dynamically adjusted, enabling the model to respond to design changes and effectively pinpoint the key micro-architectural parameters and components responsible for performance bottlenecks. Thus, AttentionDSE accurately, purposefully, and rapidly discovers optimal designs. Experiments on SPEC 2017 demonstrate that AttentionDSE reduces exploration time by over 80% and achieves a 3.9% improvement in Pareto hypervolume compared to state-of-the-art DSE frameworks, while maintaining superior prediction accuracy and efficiency as the number of parameters increases.
Submitted 23 October, 2024; originally announced October 2024.
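One plausible reading of "attention maps micro-architectural parameters to their contributions to predicted performance" is a self-attention surrogate over per-parameter embeddings; the sketch below is an assumption-laden toy, not the AttentionDSE architecture.

```python
import torch
import torch.nn as nn

class AttentionPerfModel(nn.Module):
    """Toy surrogate: self-attention over per-parameter embeddings predicts a
    performance score; the attention weights can be inspected as a rough
    indication of which parameters the prediction attends to."""
    def __init__(self, d_model: int = 64, n_heads: int = 4):
        super().__init__()
        self.embed = nn.Linear(1, d_model)      # one scalar per micro-arch parameter
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.readout = nn.Linear(d_model, 1)    # predicted performance (e.g., IPC)

    def forward(self, params: torch.Tensor):    # params: (batch, n_params)
        x = self.embed(params.unsqueeze(-1))    # (batch, n_params, d_model)
        out, attn_weights = self.attn(x, x, x, need_weights=True)
        return self.readout(out.mean(dim=1)).squeeze(-1), attn_weights
```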
To address these problems, we propose Double Bank Dual Consistency (DBDC), a novel MM-SSL approach for medical image segmentation. To address challenge (1), we propose a modality all-in-one segmentation network that accommodates data from any number of modalities, removing the limitation on modality count. To address challenge (2), we design two learnable plug-in banks, Modality-Level Modulation bank (MLMB) and Modality-Level Prototype (MLPB) bank, to capture both modality-invariant and modality-specific knowledge. These banks are updated using our proposed Modality Prototype Contrastive Learning (MPCL). Additionally, we design Modality Adaptive Weighting (MAW) to dynamically adjust learning weights for each modality, ensuring balanced MM learning as different modalities learn at different rates. Finally, to address challenge (3), we introduce a Dual Consistency (DC) strategy that enforces consistency at both the image and feature levels without relying on generative methods. We evaluate our method on a 2-to-4 modality segmentation task using three open-source datasets, and extensive experiments show that our method outperforms state-of-the-art approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17565v1-abstract-full').style.display = 'none'; document.getElementById('2410.17565v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17269">arXiv:2410.17269</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17269">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FairFML: Fair Federated Machine Learning with a Case Study on Reducing Gender Disparities in Cardiac Arrest Outcome Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Siqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qiming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+D">Di Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+C">Chuan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+W">Wenjun Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+Y">Yuqing Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Okada%2C+Y">Yohei Okada</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M+H">Michael Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Mengying Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+Y">Yilin Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Ong%2C+M+E+H">Marcus Eng Hock Ong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Nan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17269v1-abstract-short" style="display: inline;"> Objective: Mitigating algorithmic disparities is a critical challenge in healthcare research, where ensuring equity and fairness is paramount. While large-scale healthcare data exist across multiple institutions, cross-institutional collaborations often face privacy constraints, highlighting the need for privacy-preserving solutions that also promote fairness. Materials and Methods: In this stud&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17269v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17269v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17269v1-abstract-full" style="display: none;"> Objective: Mitigating algorithmic disparities is a critical challenge in healthcare research, where ensuring equity and fairness is paramount. While large-scale healthcare data exist across multiple institutions, cross-institutional collaborations often face privacy constraints, highlighting the need for privacy-preserving solutions that also promote fairness. Materials and Methods: In this study, we present Fair Federated Machine Learning (FairFML), a model-agnostic solution designed to reduce algorithmic bias in cross-institutional healthcare collaborations while preserving patient privacy. As a proof of concept, we validated FairFML using a real-world clinical case study focused on reducing gender disparities in cardiac arrest outcome prediction. Results: We demonstrate that the proposed FairFML framework enhances fairness in federated learning (FL) models without compromising predictive performance. Our findings show that FairFML improves model fairness by up to 65% compared to the centralized model, while maintaining performance comparable to both local and centralized models, as measured by receiver operating characteristic analysis. Discussion and Conclusion: FairFML offers a promising and flexible solution for FL collaborations, with its adaptability allowing seamless integration with various FL frameworks and models, from traditional statistical methods to deep learning techniques. This makes FairFML a robust approach for developing fairer FL models across diverse clinical and biomedical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17269v1-abstract-full').style.display = 'none'; document.getElementById('2410.17269v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 

arXiv:2410.13331 [pdf, other] (cs.LG, cs.AI)
Improving Discrete Optimisation Via Decoupled Straight-Through Gumbel-Softmax
Authors: Rushi Shah, Mingyuan Yan, Michael Curtis Mozer, Dianbo Liu
Abstract: Discrete representations play a crucial role in many deep learning architectures, yet their non-differentiable nature poses significant challenges for gradient-based optimization. To address this issue, various gradient estimators have been developed, including the Straight-Through Gumbel-Softmax (ST-GS) estimator, which combines the Straight-Through Estimator (STE) and the Gumbel-based reparameterization trick. However, the performance of ST-GS is highly sensitive to temperature, with its selection often compromising gradient fidelity. In this work, we propose a simple yet effective extension to ST-GS by employing decoupled temperatures for forward and backward passes, which we refer to as "Decoupled ST-GS". We show that our approach significantly enhances the original ST-GS through extensive experiments across multiple tasks and datasets. We further investigate the impact of our method on gradient fidelity from multiple perspectives, including the gradient gap and the bias-variance trade-off of estimated gradients. Our findings contribute to the ongoing effort to improve discrete optimization in deep learning, offering a practical solution that balances simplicity and effectiveness.
Submitted 17 October, 2024; originally announced October 2024.

arXiv:2410.10180 [pdf, other] (cs.LG, stat.ML)
Gaussian Mixture Vector Quantization with Aggregated Categorical Posterior
Authors: Mingyuan Yan, Jiawei Wu, Rushi Shah, Dianbo Liu
Abstract: Vector quantization is a widely used method for mapping continuous representations to a discrete space, with important applications in tokenization for generative models, information bottlenecking, and many other tasks in machine learning. The Vector Quantized Variational Autoencoder (VQ-VAE) is a type of variational autoencoder that uses discrete embeddings as latents. We generalize the technique further, enriching the probabilistic framework with a Gaussian mixture as the underlying generative model. This framework leverages a codebook of latent means and adaptive variances to capture complex data distributions. This principled framework avoids various heuristics and strong assumptions that are needed with the VQ-VAE to address training instability and to improve codebook utilization. The approach integrates the benefits of both discrete and continuous representations within a variational Bayesian framework. Furthermore, by introducing the Aggregated Categorical Posterior Evidence Lower Bound (ALBO), we offer a principled alternative optimization objective that aligns variational distributions with the generative model. Our experiments demonstrate that GM-VQ improves codebook utilization and reduces information loss without relying on handcrafted heuristics.
Submitted 14 October, 2024; originally announced October 2024.
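
To make the "codebook of latent means and adaptive variances" concrete, here is a small, hypothetical sketch of how an encoder output could be softly assigned to such a Gaussian-mixture codebook; the variable names and the use of diagonal Gaussians are assumptions made purely for illustration, not the paper's formulation.

    import torch
    import torch.nn.functional as F

    def gaussian_codebook_responsibilities(z, means, log_vars, mix_logits):
        # z: (batch, d) encoder outputs; means/log_vars: (K, d) codebook of
        # diagonal Gaussian components; mix_logits: (K,) mixture weights.
        # Returns the categorical posterior q(k | z) under the mixture.
        var = log_vars.exp()                                # (K, d)
        diff = z.unsqueeze(1) - means.unsqueeze(0)          # (batch, K, d)
        log_lik = -0.5 * ((diff ** 2) / var + log_vars
                          + torch.log(torch.tensor(2 * torch.pi))).sum(-1)
        return F.softmax(log_lik + F.log_softmax(mix_logits, dim=-1), dim=-1)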

arXiv:2410.01337 [pdf, other] (cs.LG, cs.AI, cs.CE)
PhyMPGN: Physics-encoded Message Passing Graph Network for spatiotemporal PDE systems
Authors: Bocheng Zeng, Qi Wang, Mengtao Yan, Yang Liu, Ruizhi Chengze, Yi Zhang, Hongsheng Liu, Zidong Wang, Hao Sun
Abstract: Solving partial differential equations (PDEs) serves as a cornerstone for modeling complex dynamical systems. Recent progress has demonstrated the great benefits of data-driven neural models for predicting spatiotemporal dynamics (e.g., tremendous speedups compared with classical numerical methods). However, most existing neural models rely on rich training data, have limited extrapolation and generalization abilities, and struggle to produce precise or reliable physical predictions under intricate conditions (e.g., irregular mesh or geometry, complex boundary conditions, diverse PDE parameters, etc.). To this end, we propose a new graph learning approach, namely the Physics-encoded Message Passing Graph Network (PhyMPGN), to model spatiotemporal PDE systems on irregular meshes given small training datasets. Specifically, we incorporate a GNN into a numerical integrator to approximate the temporal marching of spatiotemporal dynamics for a given PDE system. Considering that many physical phenomena are governed by diffusion processes, we further design a learnable Laplace block, which encodes the discrete Laplace-Beltrami operator, to aid and guide the GNN learning in a physically feasible solution space. A boundary condition padding strategy is also designed to improve model convergence and accuracy. Extensive experiments demonstrate that PhyMPGN is capable of accurately predicting various types of spatiotemporal dynamics on coarse unstructured meshes, consistently achieves state-of-the-art results, and outperforms other baselines with considerable gains.
Submitted 2 October, 2024; originally announced October 2024.
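
As a rough illustration of "incorporating a GNN into a numerical integrator to approximate the temporal marching," the sketch below wraps a learned right-hand-side network in a second-order Runge-Kutta step; the integrator choice, function names, and interface are assumptions for illustration and are not taken from the paper.

    def rk2_step_with_gnn(u, graph, gnn, dt):
        # u: nodal state on the mesh graph; gnn(u, graph) plays the role of a
        # learned approximation to du/dt (illustrative interface only).
        k1 = gnn(u, graph)
        k2 = gnn(u + 0.5 * dt * k1, graph)
        return u + dt * k2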

arXiv:2409.15542 [pdf, other] (cs.CR)
Ditto: Elastic Confidential VMs with Secure and Dynamic CPU Scaling
Authors: Shixuan Zhao, Mengyuan Li, Mengjia Yan, Zhiqiang Lin
Abstract: Confidential Virtual Machines (CVMs) are a type of VM-based Trusted Execution Environment (TEE) designed to enhance the security of cloud-based VMs, safeguarding them even from malicious hypervisors. Although CVMs have been widely adopted by major cloud service providers, current CVM designs face significant challenges in runtime resource management due to their fixed capacities and lack of transparency. These limitations hamper efficient cloud resource management, leading to increased operational costs and reduced agility in responding to fluctuating workloads. This paper introduces a dynamic CPU resource management approach, featuring the novel concept of an "Elastic CVM". This approach allows for hypervisor-assisted runtime adjustment of CPU resources using a specialized vCPU type, termed a Worker vCPU. This new approach enhances CPU resource adaptability and operational efficiency without compromising security. Additionally, we introduce a Worker vCPU Abstraction Layer to simplify Worker vCPU deployment and management. To demonstrate the effectiveness of our approach, we have designed and implemented a serverless computing prototype platform, called Ditto. We show that Ditto significantly improves performance and efficiency through finer-grain resource management. The concept of "Elastic CVM" and the Worker vCPU design not only optimize cloud resource utilization but also pave the way for more flexible and cost-effective confidential computing environments.
Submitted 23 September, 2024; originally announced September 2024.

arXiv:2409.13557 [pdf, other] (cs.CV, cs.AI)
Trustworthy Hate Speech Detection Through Visual Augmentation
Authors: Ziyuan Yang, Ming Yan, Yingyu Chen, Hui Wang, Zexin Lu, Yi Zhang
Abstract: The surge of hate speech on social media platforms poses a significant challenge, with hate speech detection (HSD) becoming increasingly critical. Current HSD methods focus on enriching contextual information to enhance detection performance, but they overlook the inherent uncertainty of hate speech. We propose a novel HSD method, named trustworthy hate speech detection through visual augmentation (TrusV-HSD), which enhances semantic information through integration with diffused visual images and mitigates uncertainty with a trustworthy loss. TrusV-HSD learns semantic representations by effectively extracting trustworthy information through multi-modal connections without paired data. Our experiments on public HSD datasets demonstrate the effectiveness of TrusV-HSD, showing remarkable improvements over conventional methods.
Submitted 20 September, 2024; originally announced September 2024.

arXiv:2409.10476 [pdf, other] (cs.CV)
SimInversion: A Simple Framework for Inversion-Based Text-to-Image Editing
Authors: Qi Qian, Haiyang Xu, Ming Yan, Juhua Hu
Abstract: Diffusion models demonstrate impressive image generation performance with text guidance. Inspired by the learning process of diffusion, existing images can be edited according to text by DDIM inversion. However, the vanilla DDIM inversion is not optimized for classifier-free guidance, and the accumulated error results in undesired performance. While many algorithms have been developed to improve the DDIM inversion framework for editing, in this work we investigate the approximation error in DDIM inversion and propose to disentangle the guidance scale for the source and target branches to reduce the error while keeping the original framework. Moreover, a better guidance scale (i.e., 0.5) than the default setting can be derived theoretically. Experiments on PIE-Bench show that our proposal can improve the performance of DDIM inversion dramatically without sacrificing efficiency.
Submitted 16 September, 2024; originally announced September 2024.
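
For readers unfamiliar with the term, "disentangling the guidance scale for the source and target branches" can be pictured as using two different classifier-free-guidance weights when predicting noise on the inversion (source) side and on the editing (target) side. The snippet below is a hypothetical sketch of that idea; the function name, the eps_model interface, and the example scale values are illustrative assumptions, not the paper's code.

    def guided_eps(eps_model, x_t, t, prompt_emb, null_emb, scale):
        # Standard classifier-free guidance: blend conditional and
        # unconditional noise predictions with a guidance weight.
        eps_uncond = eps_model(x_t, t, null_emb)
        eps_cond = eps_model(x_t, t, prompt_emb)
        return eps_uncond + scale * (eps_cond - eps_uncond)

    # Disentangled scales: e.g. a small scale (around 0.5) on the source/inversion
    # branch and the usual larger scale on the target/editing branch.
    # eps_src = guided_eps(eps_model, x_t, t, src_emb, null_emb, scale=0.5)
    # eps_tgt = guided_eps(eps_model, x_t, t, tgt_emb, null_emb, scale=7.5)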

arXiv:2409.05486 (cs.CL, cs.AI)
Elsevier Arena: Human Evaluation of Chemistry/Biology/Health Foundational Large Language Models
Authors: Camilo Thorne, Christian Druckenbrodt, Kinga Szarkowska, Deepika Goyal, Pranita Marajan, Vijay Somanath, Corey Harper, Mao Yan, Tony Scerri
Abstract: arXiv admin comment: This version has been removed by arXiv administrators as the submitter did not have the rights to agree to the license at the time of submission.
Submitted 17 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
Comments: This document was submitted without obtaining all necessary permissions and therefore needs to be withdrawn. The corresponding author apologizes for any inconvenience this might cause.

arXiv:2409.03420 [pdf, other] (cs.CV)
mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page Document Understanding
Authors: Anwen Hu, Haiyang Xu, Liang Zhang, Jiabo Ye, Ming Yan, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou
Abstract: Multimodal Large Language Models (MLLMs) have achieved promising OCR-free Document Understanding performance by increasing the supported resolution of document images. However, this comes at the cost of generating thousands of visual tokens for a single document image, leading to excessive GPU memory and slower inference times, particularly in multi-page document comprehension. In this work, to address these challenges, we propose a High-resolution DocCompressor module to compress each high-resolution document image into 324 tokens, guided by low-resolution global visual features. With this compression module, to strengthen multi-page document comprehension ability and balance both token efficiency and question-answering performance, we develop DocOwl2 under a three-stage training framework: Single-image Pretraining, Multi-image Continue-pretraining, and Multi-task Finetuning. DocOwl2 sets a new state of the art across multi-page document understanding benchmarks and reduces first-token latency by more than 50%, demonstrating advanced capabilities in multi-page question answering, explanation with evidence pages, and cross-page structure understanding. Additionally, compared to single-image MLLMs trained on similar data, our DocOwl2 achieves comparable single-page understanding performance with less than 20% of the visual tokens. Our codes, models, and data are publicly available at https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl2.
Submitted 9 September, 2024; v1 submitted 5 September, 2024; originally announced September 2024.
Comments: 15 pages, 7 figures

arXiv:2409.00676 [pdf, other] (cs.SE)
Fixing Code Generation Errors for Large Language Models
Authors: Hao Wen, Yueheng Zhu, Chao Liu, Xiaoxue Ren, Weiwei Du, Meng Yan
Abstract: Code generation leverages artificial intelligence technologies, particularly Large Language Models (LLMs), to automatically produce source code, enhancing software development efficiency and reducing repetitive tasks. However, the LLMs' generated code often fails to pass test cases and requires substantial human effort to fix errors. Previous studies focused on better prompts or improving LLMs' capability but ignored why LLMs failed. In this paper, we first reproduced 14 LLMs, including GPT-3.5-turbo and 13 open-source LLMs, on the HumanEval dataset. We extracted 12,837 code generation errors and conducted an in-depth analysis of their causes, which led to the identification of 19 distinct error causes. Our empirical analysis indicated that three of these causes can be directly fixed. Consequently, we proposed a fixing method called LlmFix, which addresses these three types of errors through a three-step process: filtering code for indentation correction, truncating redundant generated code, and importing missing modules. Experimental results demonstrate that LlmFix can fix these three types of errors, significantly improving the performance of 14 LLMs on the HumanEval and MBPP datasets, with average increases of 9.5% and 5.4%, respectively.
Submitted 1 September, 2024; originally announced September 2024.
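
The three-step fix described above (indentation filtering, truncation of redundant output, and adding missing imports) is easy to picture with a toy implementation. The sketch below is not LlmFix itself; the module whitelist, the tab-to-space heuristic, and the parse-based truncation are simplifying assumptions made purely for illustration.

    import ast

    # Small illustrative whitelist of modules the cleaner knows how to import.
    KNOWN_MODULES = ("math", "re", "itertools", "collections", "heapq")

    def cleanup_generated_code(code: str) -> str:
        # Step 1: indentation correction (here, just normalise tabs to spaces).
        lines = [ln.replace("\t", "    ") for ln in code.splitlines()]

        # Step 2: truncate redundant trailing output by keeping the longest
        # prefix of lines that still parses as valid Python.
        for end in range(len(lines), 0, -1):
            candidate = "\n".join(lines[:end])
            try:
                tree = ast.parse(candidate)
                break
            except SyntaxError:
                continue
        else:
            return code  # nothing parses; return the input unchanged

        # Step 3: import missing modules that are referenced but never imported.
        imported = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                imported.update(alias.name.split(".")[0] for alias in node.names)
            elif isinstance(node, ast.ImportFrom) and node.module:
                imported.add(node.module.split(".")[0])
        used = {n.id for n in ast.walk(tree) if isinstance(n, ast.Name)}
        missing = [m for m in KNOWN_MODULES if m in used and m not in imported]
        header = "".join(f"import {m}\n" for m in missing)
        return header + candidate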

arXiv:2408.15089 [pdf, other] (cs.AR, cs.LG)
SiHGNN: Leveraging Properties of Semantic Graphs for Efficient HGNN Acceleration
Authors: Runzhen Xue, Mingyu Yan, Dengke Han, Zhimin Tang, Xiaochun Ye, Dongrui Fan
Abstract: Heterogeneous Graph Neural Networks (HGNNs) have expanded graph representation learning to heterogeneous graph fields. Recent studies have demonstrated their superior performance across various applications, including medical analysis and recommendation systems, often surpassing existing methods. However, GPUs often experience inefficiencies when executing HGNNs due to their unique and complex execution patterns. Compared to traditional Graph Neural Networks, these patterns further exacerbate irregularities in memory access. To tackle these challenges, recent studies have focused on developing domain-specific accelerators for HGNNs. Nonetheless, most of these efforts have concentrated on optimizing the datapath or scheduling data accesses, while largely overlooking the potential benefits that could be gained from leveraging the inherent properties of the semantic graph, such as its topology, layout, and generation. In this work, we focus on leveraging the properties of semantic graphs to enhance HGNN performance. First, we analyze the Semantic Graph Build (SGB) stage and identify significant opportunities for data reuse during semantic graph generation. Next, we uncover the phenomenon of buffer thrashing during the Graph Feature Processing (GFP) stage, revealing potential optimization opportunities in semantic graph layout. Furthermore, we propose a lightweight hardware accelerator frontend for HGNNs, called SiHGNN. This accelerator frontend incorporates a tree-based Semantic Graph Builder for efficient semantic graph generation and features a novel Graph Restructurer for optimizing semantic graph layouts. Experimental results show that SiHGNN enables the state-of-the-art HGNN accelerator to achieve an average performance improvement of 2.95$\times$.
Submitted 27 August, 2024; originally announced August 2024.
Comments: 12 pages, 18 figures. arXiv admin note: text overlap with arXiv:2404.04792

arXiv:2408.12321 [pdf, other] (cs.CL, cs.CV, cs.MM)
MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework for Multimodal Large Language Model
Authors: Chaoya Jiang, Jia Hongrui, Haiyang Xu, Wei Ye, Mengfan Dong, Ming Yan, Ji Zhang, Fei Huang, Shikun Zhang
Abstract: This paper presents MaVEn, an innovative Multi-granularity Visual Encoding framework designed to enhance the capabilities of Multimodal Large Language Models (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on single-image visual understanding, limiting their ability to interpret and integrate information across multiple images. MaVEn addresses this limitation by combining discrete visual symbol sequences, which abstract coarse-grained semantic concepts, with traditional continuous representation sequences that model fine-grained features. This dual approach bridges the semantic gap between visual and textual data, thereby improving the model's ability to process and interpret information from multiple images effectively. Additionally, we design a dynamic reduction mechanism for long-sequence continuous features to enhance multi-image processing efficiency. Experimental results demonstrate that MaVEn significantly enhances MLLMs' understanding in complex multi-image scenarios, while also improving performance in single-image contexts.
Submitted 26 August, 2024; v1 submitted 22 August, 2024; originally announced August 2024.

arXiv:2408.08490 [pdf, other] (cs.AR)
Accelerating Mini-batch HGNN Training by Reducing CUDA Kernels
Authors: Meng Wu, Jingkai Qiu, Mingyu Yan, Wenming Li, Yang Zhang, Zhimin Zhang, Xiaochun Ye, Dongrui Fan
Abstract: Heterogeneous graph neural networks (HGNNs) are essential for capturing the structure and semantic information in heterogeneous graphs. However, existing GPU-based solutions, such as PyTorch Geometric, suffer from low GPU utilization due to numerous short-execution-time and memory-bound CUDA kernels during HGNN training. To address this issue, we introduce HiFuse, an enhancement for PyTorch Geometric designed to accelerate mini-batch HGNN training on CPU-GPU systems. From the data perspective, we reorganize and merge multiple smaller vertex feature matrices into larger ones, enabling a single kernel to process larger data chunks. This efficiently exploits data locality, reduces kernel launch overhead, and improves overall GPU utilization. From the workflow perspective, we carefully offload the construction of semantic graphs from the GPU to the CPU to reduce the number of CUDA kernels. To meet the parallelism requirements on the CPU and ensure seamless execution between CPU and GPU, we employ parallelization techniques including multi-threading and asynchronous pipelining. This allows different stages of the process to overlap, enhancing GPU utilization and reducing end-to-end execution latency, leading to a more efficient and balanced use of computational resources. Through extensive experiments, HiFuse demonstrates an average 2.38 times speedup compared to a state-of-the-art solution.
Submitted 15 August, 2024; originally announced August 2024.
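
The data-side idea of merging many small per-type feature matrices so that one kernel touches a larger chunk can be sketched in a few lines; the helper below is purely illustrative (the names and offset bookkeeping are assumptions), and it says nothing about the CPU offloading or pipelining parts of the system.

    import torch

    def merge_feature_matrices(feature_mats):
        # feature_mats: list of (n_i, d) vertex-feature tensors sharing the same
        # feature width d. Concatenating them lets a single kernel process one
        # large (sum(n_i), d) matrix instead of many tiny ones; the offsets let
        # callers recover each original slice afterwards.
        offsets, total = [], 0
        for m in feature_mats:
            offsets.append(total)
            total += m.shape[0]
        return torch.cat(feature_mats, dim=0), offsets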

arXiv:2408.04998 [pdf, other] (cs.CL, cs.AI)
ProFuser: Progressive Fusion of Large Language Models
Authors: Tianyuan Shi, Fanqi Wan, Canbin Huang, Xiaojun Quan, Chenliang Li, Ming Yan, Ji Zhang
Abstract: While fusing the capacities and advantages of various large language models (LLMs) offers a pathway to construct more powerful and versatile models, a fundamental challenge is how to properly select the advantageous model during training. Existing fusion methods primarily focus on the training mode, which uses cross entropy on ground truth in a teacher-forcing setup to measure a model's advantage; this may provide limited insight into model advantage. In this paper, we introduce a novel approach that enhances the fusion process by incorporating both the training and inference modes. Our method evaluates model advantage not only through cross entropy during training but also by considering inference outputs, providing a more comprehensive assessment. To combine the two modes effectively, we introduce ProFuser to progressively transition from the inference mode to the training mode. To validate ProFuser's effectiveness, we fused three models, including vicuna-7b-v1.5, Llama-2-7b-chat, and mpt-7b-8k-chat, and demonstrated improved performance in knowledge, reasoning, and safety compared to baseline methods.
Submitted 9 August, 2024; originally announced August 2024.
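
One hypothetical way to picture the progressive transition between the two advantage signals is a simple schedule that shifts weight from an inference-based score toward the training-mode (cross-entropy) score over the course of training. Everything below, including the weighting scheme and names, is an assumption made for illustration; the paper's actual formulation may differ.

    def blended_advantage(ce_loss, inference_score, step, total_steps):
        # alpha ramps from 0 to 1, moving the advantage estimate from the
        # inference-mode signal toward the training-mode (cross-entropy) signal.
        alpha = min(1.0, step / max(1, total_steps))
        training_advantage = -ce_loss  # lower loss means larger advantage
        return (1 - alpha) * inference_score + alpha * training_advantage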

arXiv:2408.04840 [pdf, other] (cs.CV, cs.AI, cs.CL, cs.LG)
mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models
Authors: Jiabo Ye, Haiyang Xu, Haowei Liu, Anwen Hu, Ming Yan, Qi Qian, Ji Zhang, Fei Huang, Jingren Zhou
Abstract: Multi-modal Large Language Models (MLLMs) have demonstrated remarkable capabilities in executing instructions for a variety of single-image tasks. Despite this progress, significant challenges remain in modeling long image sequences. In this work, we introduce the versatile multi-modal large language model mPLUG-Owl3, which enhances the capability for long image-sequence understanding in scenarios that incorporate retrieved image-text knowledge, interleaved image-text, and lengthy videos. Specifically, we propose novel hyper attention blocks to efficiently integrate vision and language into a common language-guided semantic space, thereby facilitating the processing of extended multi-image scenarios. Extensive experimental results suggest that mPLUG-Owl3 achieves state-of-the-art performance among models of a similar size on single-image, multi-image, and video benchmarks. Moreover, we propose a challenging long visual sequence evaluation named Distractor Resistance to assess the ability of models to maintain focus amidst distractions. Finally, with the proposed architecture, mPLUG-Owl3 demonstrates outstanding performance on ultra-long visual sequence inputs. We hope that mPLUG-Owl3 can contribute to the development of more efficient and powerful multimodal large language models.
Submitted 13 August, 2024; v1 submitted 8 August, 2024; originally announced August 2024.

arXiv:2408.02801 [pdf, ps, other] (cs.LG, math.OC, stat.ML)
Sparse Deep Learning Models with the $\ell_1$ Regularization
Authors: Lixin Shen, Rui Wang, Yuesheng Xu, Mingsong Yan
Abstract: Sparse neural networks are highly desirable in deep learning for reducing its complexity. The goal of this paper is to study how choices of regularization parameters influence the sparsity level of learned neural networks. We first derive the $\ell_1$-norm sparsity-promoting deep learning models, including single and multiple regularization parameter models, from a statistical viewpoint. We then characterize the sparsity level of a regularized neural network in terms of the choice of the regularization parameters. Based on the characterizations, we develop iterative algorithms for selecting regularization parameters so that the weight parameters of the resulting deep neural network enjoy prescribed sparsity levels. Numerical experiments are presented to demonstrate the effectiveness of the proposed algorithms in choosing desirable regularization parameters and in obtaining corresponding neural networks that have both predetermined sparsity levels and satisfactory approximation accuracy.
Submitted 5 August, 2024; originally announced August 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01902v2-abstract-full').style.display = 'none'; document.getElementById('2408.01902v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15272">arXiv:2407.15272</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.15272">pdf</a>, <a href="https://arxiv.org/format/2407.15272">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MIBench: Evaluating Multimodal Large Language Models over Multiple Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Haowei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yaya Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+C">Chaoya Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+C">Chunfeng Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Bing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Weiming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15272v2-abstract-short" style="display: inline;"> Built on the power of LLMs, numerous multimodal large language models (MLLMs) have recently achieved remarkable performance on various vision-language tasks. However, most existing MLLMs and benchmarks primarily focus on single-image input scenarios, leaving the performance of MLLMs when handling realistic multiple images underexplored. Although a few benchmarks consider multiple images, their eva&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15272v2-abstract-full').style.display = 'inline'; document.getElementById('2407.15272v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15272v2-abstract-full" style="display: none;"> Built on the power of LLMs, numerous multimodal large language models (MLLMs) have recently achieved remarkable performance on various vision-language tasks. However, most existing MLLMs and benchmarks primarily focus on single-image input scenarios, leaving the performance of MLLMs when handling realistic multiple images underexplored. Although a few benchmarks consider multiple images, their evaluation dimensions and samples are very limited. 
In this paper, we propose a new benchmark MIBench, to comprehensively evaluate fine-grained abilities of MLLMs in multi-image scenarios. Specifically, MIBench categorizes the multi-image abilities into three scenarios: multi-image instruction (MII), multimodal knowledge-seeking (MKS) and multimodal in-context learning (MIC), and constructs 13 tasks with a total of 13K annotated samples. During data construction, for MII and MKS, we extract correct options from manual annotations and create challenging distractors to obtain multiple-choice questions. For MIC, to enable an in-depth evaluation, we set four sub-tasks and transform the original datasets into in-context learning formats. We evaluate several open-source and closed-source MLLMs on the proposed MIBench. The results reveal that although current models excel in single-image tasks, they exhibit significant shortcomings when faced with multi-image inputs, such as limited fine-grained perception, multi-image reasoning and in-context learning abilities. The annotated data of MIBench is available at https://huggingface.co/datasets/StarBottle/MIBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15272v2-abstract-full').style.display = 'none'; document.getElementById('2407.15272v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14355">arXiv:2407.14355</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14355">pdf</a>, <a href="https://arxiv.org/format/2407.14355">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Zero-shot Audio Classification using Sound Attribute Knowledge from Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuenan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Pingyue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mengyue Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14355v1-abstract-short" style="display: inline;"> Zero-shot audio classification aims to recognize and classify a sound class that the model has never seen during training. This paper presents a novel approach for zero-shot audio classification using automatically generated sound attribute descriptions. 
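The abstract above points to the MIBench annotations on the Hugging Face Hub; the following is a minimal sketch of loading them with the `datasets` library. It assumes the repository ID taken from that URL and the default configuration and split names; the actual configurations and column names may differ, so check the dataset card.

    # Minimal sketch: loading the MIBench annotations from the Hugging Face Hub.
    # Assumes the repo id from the abstract's URL and default config/split names.
    from datasets import load_dataset

    ds = load_dataset("StarBottle/MIBench")    # repo id taken from the URL in the abstract
    print(ds)                                  # show available splits and features

    first = next(iter(ds[list(ds.keys())[0]]))
    print(first.keys())                        # inspect one annotated sample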
arXiv:2407.14355 (https://arxiv.org/abs/2407.14355) [pdf, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Enhancing Zero-shot Audio Classification using Sound Attribute Knowledge from Large Language Models
Authors: Xuenan Xu, Pingyue Zhang, Ming Yan, Ji Zhang, Mengyue Wu
Abstract: Zero-shot audio classification aims to recognize and classify a sound class that the model has never seen during training. This paper presents a novel approach for zero-shot audio classification using automatically generated sound attribute descriptions. We propose a list of sound attributes and leverage large language models' domain knowledge to generate detailed attribute descriptions for each class. In contrast to previous works that primarily relied on class labels or simple descriptions, our method focuses on multi-dimensional innate auditory attributes, capturing different characteristics of sound classes. Additionally, we incorporate a contrastive learning approach to enhance zero-shot learning from textual labels. We validate the effectiveness of our method on VGGSound and AudioSet (the code is available at https://www.github.com/wsntxxn/AttrEnhZsAc). Our results demonstrate a substantial improvement in zero-shot classification accuracy. Ablation results show robust performance enhancement, regardless of the model architecture.
Submitted 19 July, 2024; originally announced July 2024.
Comments: Interspeech 2024
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13198v1-abstract-full').style.display = 'none'; document.getElementById('2407.13198v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12232">arXiv:2407.12232</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12232">pdf</a>, <a href="https://arxiv.org/format/2407.12232">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> RTL Verification for Secure Speculation Using Contract Shadow Logic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+Q">Qinhan Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Bourgeat%2C+T">Thomas Bourgeat</a>, <a href="/search/cs?searchtype=author&amp;query=Malik%2C+S">Sharad Malik</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Mengjia Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12232v1-abstract-short" style="display: inline;"> Modern out-of-order processors face speculative execution attacks. Despite various proposed software and hardware mitigations to prevent such attacks, new attacks keep arising from unknown vulnerabilities. Thus, a formal and rigorous evaluation of the ability of hardware designs to deal with speculative execution attacks is urgently desired. This paper proposes a formal verification technique call&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12232v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12232v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12232v1-abstract-full" style="display: none;"> Modern out-of-order processors face speculative execution attacks. Despite various proposed software and hardware mitigations to prevent such attacks, new attacks keep arising from unknown vulnerabilities. Thus, a formal and rigorous evaluation of the ability of hardware designs to deal with speculative execution attacks is urgently desired. This paper proposes a formal verification technique called Contract Shadow Logic that can considerably improve RTL verification scalability while being applicable to different defense mechanisms. In this technique, we leverage computer architecture design insights to improve verification performance for checking security properties formulated as software-hardware contracts for secure speculation. Our verification scheme is accessible to computer architects and requires minimal formal-method expertise. We evaluate our technique on multiple RTL designs, including three out-of-order processors. 
The experimental results demonstrate that our technique exhibits a significant advantage in finding attacks on insecure designs and deriving complete proofs on secure designs, when compared to the baseline and two state-of-the-art verification schemes, LEAVE and UPEC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12232v1-abstract-full').style.display = 'none'; document.getElementById('2407.12232v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted to ASPLOS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11790">arXiv:2407.11790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11790">pdf</a>, <a href="https://arxiv.org/format/2407.11790">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> Characterizing and Understanding HGNN Training on GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+D">Dengke Han</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Mingyu Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+X">Xiaochun Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+D">Dongrui Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11790v4-abstract-short" style="display: inline;"> Owing to their remarkable representation capabilities for heterogeneous graph data, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in many critical real-world domains such as recommendation systems and medical analysis. Prior to their practical application, identifying the optimal HGNN model parameters tailored to specific tasks through extensive training is a time-consuming&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11790v4-abstract-full').style.display = 'inline'; document.getElementById('2407.11790v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11790v4-abstract-full" style="display: none;"> Owing to their remarkable representation capabilities for heterogeneous graph data, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in many critical real-world domains such as recommendation systems and medical analysis. 
Prior to their practical application, identifying the optimal HGNN model parameters tailored to specific tasks through extensive training is a time-consuming and costly process. To enhance the efficiency of HGNN training, it is essential to characterize and analyze the execution semantics and patterns within the training process to identify performance bottlenecks. In this study, we conduct an in-depth quantification and analysis of two mainstream HGNN training scenarios, including single-GPU and multi-GPU distributed training. Based on the characterization results, we disclose the performance bottlenecks and their underlying causes in different HGNN training scenarios and provide optimization guidelines from both software and hardware perspectives. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11790v4-abstract-full').style.display = 'none'; document.getElementById('2407.11790v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 14 figures, to appear in ACM Transactions on Architecture and Code Optimization (ACM TACO)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11034">arXiv:2407.11034</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11034">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bridging Data Gaps in Healthcare: A Scoping Review of Transfer Learning in Biomedical Data Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Siqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+K">Kunyu Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+D">Di Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Mingcheng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Mengying Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ke%2C+Y">Yuhe Ke</a>, <a href="/search/cs?searchtype=author&amp;query=D%27Agostino%2C+D">Danny D&#39;Agostino</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+Y">Yilin Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qiming Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziwen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+Y">Yuqing Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Molei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+C">Chuan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+N">Nan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11034v1-abstract-short" style="display: inline;"> Clinical and biomedical research in 
low-resource settings often faces significant challenges due to the need for high-quality data with sufficient sample sizes to construct effective models. These constraints hinder robust model training and prompt researchers to seek methods for leveraging existing knowledge from related studies to support new research efforts. Transfer learning (TL), a machine l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11034v1-abstract-full').style.display = 'inline'; document.getElementById('2407.11034v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11034v1-abstract-full" style="display: none;"> Clinical and biomedical research in low-resource settings often faces significant challenges due to the need for high-quality data with sufficient sample sizes to construct effective models. These constraints hinder robust model training and prompt researchers to seek methods for leveraging existing knowledge from related studies to support new research efforts. Transfer learning (TL), a machine learning technique, emerges as a powerful solution by utilizing knowledge from pre-trained models to enhance the performance of new models, offering promise across various healthcare domains. Despite its conceptual origins in the 1990s, the application of TL in medical research has remained limited, especially beyond image analysis. In our review of TL applications in structured clinical and biomedical data, we screened 3,515 papers, with 55 meeting the inclusion criteria. Among these, only 2% (one out of 55) utilized external studies, and 7% (four out of 55) addressed scenarios involving multi-site collaborations with privacy constraints. To achieve actionable TL with structured medical data while addressing regional disparities, inequality, and privacy constraints in healthcare research, we advocate for the careful identification of appropriate source data and models, the selection of suitable TL frameworks, and the validation of TL models with proper baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11034v1-abstract-full').style.display = 'none'; document.getElementById('2407.11034v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
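As a generic illustration of the transfer-learning pattern the review surveys (reuse a pre-trained model and retrain only a small task head on limited local data), here is a minimal PyTorch sketch; the backbone, layer sizes, and synthetic data are placeholders, not anything drawn from the reviewed studies.

    # Generic transfer-learning sketch in PyTorch: freeze a pre-trained backbone
    # and fit a small task-specific head on limited local data.
    # The backbone, sizes, and data below are placeholders, not from the review.
    import torch
    import torch.nn as nn

    backbone = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 64))  # stands in for a pre-trained model
    # backbone.load_state_dict(torch.load("pretrained.pt"))  # hypothetical checkpoint

    for p in backbone.parameters():          # freeze the transferred knowledge
        p.requires_grad = False

    head = nn.Linear(64, 2)                  # new task head (e.g., a binary clinical outcome)
    opt = torch.optim.Adam(head.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    x = torch.randn(128, 32)                 # small target-domain dataset (synthetic here)
    y = torch.randint(0, 2, (128,))

    for _ in range(100):
        opt.zero_grad()
        loss = loss_fn(head(backbone(x)), y)
        loss.backward()
        opt.step()
    print("final training loss:", float(loss))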
arXiv:2407.08265 (https://arxiv.org/abs/2407.08265) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Coordinate-Aware Thermal Infrared Tracking Via Natural Language Modeling
Authors: Miao Yan, Ping Zhang, Haofei Zhang, Ruqian Hao, Juanxiu Liu, Xiaoyang Wang, Lin Liu
Abstract: Thermal infrared (TIR) tracking is pivotal in computer vision tasks due to its all-weather imaging capability. Traditional tracking methods predominantly rely on hand-crafted features, and while deep learning has introduced correlation filtering techniques, these are often constrained by rudimentary correlation operations. Furthermore, transformer-based approaches tend to overlook temporal and coordinate information, which is critical for TIR tracking that lacks texture and color information. In this paper, to address these issues, we apply natural language modeling to TIR tracking and propose a coordinate-aware thermal infrared tracking model called NLMTrack, which enhances the utilization of coordinate and temporal information. NLMTrack applies an encoder that unifies feature extraction and feature fusion, which simplifies the TIR tracking pipeline. To address the challenge of low detail and low contrast in TIR images, on the one hand, we design a multi-level progressive fusion module that enhances the semantic representation and incorporates multi-scale features. On the other hand, the decoder combines the TIR features and the coordinate sequence features using a causal transformer to generate the target sequence step by step. Moreover, we explore an adaptive loss aimed at elevating tracking accuracy and a simple template update strategy to accommodate the target's appearance variations. Experiments show that NLMTrack achieves state-of-the-art performance on multiple benchmarks. The code is publicly available at https://github.com/ELOESZHANG/NLMTrack.
Submitted 26 July, 2024; v1 submitted 11 July, 2024; originally announced July 2024.

arXiv:2407.07462 (https://arxiv.org/abs/2407.07462) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: MAN TruckScenes: A multimodal dataset for autonomous trucking in diverse conditions
Authors: Felix Fent, Fabian Kuttenreich, Florian Ruch, Farija Rizwin, Stefan Juergens, Lorenz Lechermann, Christian Nissler, Andrea Perl, Ulrich Voll, Min Yan, Markus Lienkamp
Abstract: Autonomous trucking is a promising technology that can greatly impact modern logistics and the environment. Ensuring its safety on public roads is one of the main duties that requires an accurate perception of the environment. To achieve this, machine learning methods rely on large datasets, but to this day, no such datasets are available for autonomous trucks. In this work, we present MAN TruckScenes, the first multimodal dataset for autonomous trucking. MAN TruckScenes allows the research community to come into contact with truck-specific challenges, such as trailer occlusions, novel sensor perspectives, and terminal environments for the first time. It comprises more than 740 scenes of 20s each within a multitude of different environmental conditions. The sensor set includes 4 cameras, 6 lidar, 6 radar sensors, 2 IMUs, and a high-precision GNSS. The dataset's 3D bounding boxes were manually annotated and carefully reviewed to achieve a high quality standard. Bounding boxes are available for 27 object classes, 15 attributes, and a range of more than 230m. The scenes are tagged according to 34 distinct scene tags, and all objects are tracked throughout the scene to promote a wide range of applications. Additionally, MAN TruckScenes is the first dataset to provide 4D radar data with 360° coverage and is thereby the largest radar dataset with annotated 3D bounding boxes. Finally, we provide extensive dataset analysis and baseline results. The dataset, development kit, and more are available online.
Submitted 11 November, 2024; v1 submitted 10 July, 2024; originally announced July 2024.
Comments: Accepted to NeurIPS 2024 Datasets and Benchmarks Track

arXiv:2406.10778 (https://arxiv.org/abs/2406.10778) [pdf, other]
Subjects: cs.CE (Computational Engineering, Finance, and Science); stat.AP (Applications)
Title: Heterogeneous Entity Representation for Medicinal Synergy Prediction
Authors: Jiawei Wu, Jun Wen, Mingyuan Yan, Anqi Dong, Shuai Gao, Ren Wang, Can Chen
Abstract: Medicinal synergy prediction is a powerful tool in drug discovery and development that harnesses the principles of combination therapy to enhance therapeutic outcomes by improving efficacy, reducing toxicity, and preventing drug resistance. While a myriad of computational methods has emerged for predicting synergistic drug combinations, a large portion of them may overlook the intricate, yet critical relationships between various entities in drug interaction networks, such as drugs, cell lines, and diseases. These relationships are complex and multidimensional, requiring sophisticated modeling to capture nuanced interplay that can significantly influence therapeutic efficacy. We introduce a salient deep hypergraph learning method, namely, Heterogeneous Entity Representation for MEdicinal Synergy prediction (HERMES), to predict anti-cancer drug synergy. HERMES integrates heterogeneous data sources, encompassing drug, cell line, and disease information, to provide a comprehensive understanding of the interactions involved. By leveraging advanced hypergraph neural networks with gated residual mechanisms, HERMES can effectively learn complex relationships/interactions within the data. Our results show HERMES demonstrates state-of-the-art performance, particularly in forecasting new drug combinations, significantly surpassing previous methods. This advancement underscores the potential of HERMES to facilitate more effective and precise drug combination predictions, thereby enhancing the development of novel therapeutic strategies.
Submitted 23 November, 2024; v1 submitted 15 June, 2024; originally announced June 2024.
Comments: 9 pages, 3 figures
MSC Class: 92C50; 05C65; 68T07
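As an aside on the "gated residual mechanisms" mentioned above: a generic gated residual block can be written in a few lines of PyTorch. The sketch below illustrates only that mechanism and is not HERMES's actual hypergraph architecture; the layer sizes and input are placeholders.

    # Generic gated-residual block (sketch of the mechanism, not HERMES itself):
    # a sigmoid gate decides, per feature, how much of the transformed signal
    # to add back onto the residual path.
    import torch
    import torch.nn as nn

    class GatedResidual(nn.Module):
        def __init__(self, dim):
            super().__init__()
            self.transform = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
            self.gate = nn.Sequential(nn.Linear(dim, dim), nn.Sigmoid())

        def forward(self, h):
            g = self.gate(h)                      # values in (0, 1), one per feature
            return h + g * self.transform(h)      # gated residual update

    h = torch.randn(4, 128)                       # e.g., node/hyperedge embeddings
    print(GatedResidual(128)(h).shape)            # torch.Size([4, 128])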
arXiv:2406.10227 (https://arxiv.org/abs/2406.10227) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: VideoGUI: A Benchmark for GUI Automation from Instructional Videos
Authors: Kevin Qinghong Lin, Linjie Li, Difei Gao, Qinchen WU, Mingyi Yan, Zhengyuan Yang, Lijuan Wang, Mike Zheng Shou
Abstract: Graphical User Interface (GUI) automation holds significant promise for enhancing human productivity by assisting with computer tasks. Existing task formulations primarily focus on simple tasks that can be specified by a single, language-only instruction, such as "Insert a new slide." In this work, we introduce VideoGUI, a novel multi-modal benchmark designed to evaluate GUI assistants on visual-centric GUI tasks. Sourced from high-quality web instructional videos, our benchmark focuses on tasks involving professional and novel software (e.g., Adobe Photoshop or Stable Diffusion WebUI) and complex activities (e.g., video editing). VideoGUI evaluates GUI assistants through a hierarchical process, allowing for identification of the specific levels at which they may fail: (i) high-level planning: reconstruct procedural subtasks from visual conditions without language descriptions; (ii) middle-level planning: generate sequences of precise action narrations based on visual state (i.e., screenshot) and goals; (iii) atomic action execution: perform specific actions such as accurately clicking designated elements. For each level, we design evaluation metrics across individual dimensions to provide clear signals, such as individual performance in clicking, dragging, typing, and scrolling for atomic action execution. Our evaluation on VideoGUI reveals that even the SoTA large multimodal model GPT4o performs poorly on visual-centric GUI tasks, especially for high-level planning.
Submitted 14 June, 2024; originally announced June 2024.
Comments: 24 pages, 16 tables, 17 figures

arXiv:2406.09095 (https://arxiv.org/abs/2406.09095) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Modeling Comparative Logical Relation with Contrastive Learning for Text Generation
Authors: Yuhao Dan, Junfeng Tian, Jie Zhou, Ming Yan, Ji Zhang, Qin Chen, Liang He
Abstract: Data-to-Text Generation (D2T), a classic natural language generation problem, aims at producing fluent descriptions for structured input data, such as a table. Existing D2T works mainly focus on describing the superficial associative relations among entities, while ignoring the deep comparative logical relations, such as A is better than B in a certain aspect with a corresponding opinion, which is quite common in our daily life. In this paper, we introduce a new D2T task named comparative logical relation generation (CLRG). Additionally, we propose a Comparative Logic (CoLo) based text generation method, which generates texts following specific comparative logical relations with contrastive learning. Specifically, we first construct various positive and negative samples by fine-grained perturbations in entities, aspects and opinions. Then, we perform contrastive learning in the encoder layer to have a better understanding of the comparative logical relations, and integrate it in the decoder layer to guide the model to correctly generate the relations. Noting the data scarcity problem, we construct a Chinese Comparative Logical Relation Dataset (CLRD), which is a high-quality human-annotated dataset and challenging for text generation with descriptions of multiple entities and annotations on their comparative logical relations. Extensive experiments show that our method achieves impressive performance in both automatic and human evaluations.
Submitted 15 August, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: NLPCC 2024
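The contrastive objective sketched in the abstract above (pull an encoding toward a positive sharing the same comparative relation and push it away from perturbed negatives) is commonly implemented as an InfoNCE-style loss; below is a minimal generic version with random embeddings, not CoLo's implementation.

    # Minimal InfoNCE-style contrastive loss over an anchor, one positive, and
    # several negatives (generic sketch; not CoLo's implementation).
    import torch
    import torch.nn.functional as F

    def info_nce(anchor, positive, negatives, temperature=0.1):
        # cosine similarities between the anchor and each candidate
        cands = torch.cat([positive.unsqueeze(0), negatives], dim=0)
        sims = F.cosine_similarity(anchor.unsqueeze(0), cands) / temperature
        # the positive sits at index 0, so this is cross-entropy against label 0
        return F.cross_entropy(sims.unsqueeze(0), torch.zeros(1, dtype=torch.long))

    anchor = torch.randn(256)       # e.g., encoder state of the original sample
    positive = torch.randn(256)     # sample with the same comparative relation
    negatives = torch.randn(8, 256) # samples with perturbed entities/aspects/opinions
    print(float(info_nce(anchor, positive, negatives)))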
To address these challenges, this paper introduces the Multi-Model Collaborativ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05940v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05940v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05940v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have strong capabilities in code comprehension, but fine-tuning costs and semantic alignment issues limit their project-specific optimization; conversely, code models such CodeBERT are easy to fine-tune, but it is often difficult to learn vulnerability semantics from complex code languages. To address these challenges, this paper introduces the Multi-Model Collaborative Vulnerability Detection approach (M2CVD) that leverages the strong capability of analyzing vulnerability semantics from LLMs to improve the detection accuracy of code models. M2CVD employs a novel collaborative process: first enhancing the quality of vulnerability semantic description produced by LLMs through the understanding of project code by code models, and then using these improved vulnerability semantic description to boost the detection accuracy of code models. We demonstrated M2CVD&#39;s effectiveness on two real-world datasets, where M2CVD significantly outperformed the baseline. In addition, we demonstrate that the M2CVD collaborative method can extend to other different LLMs and code models to improve their accuracy in vulnerability detection tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05940v2-abstract-full').style.display = 'none'; document.getElementById('2406.05940v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04648">arXiv:2406.04648</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.04648">pdf</a>, <a href="https://arxiv.org/format/2406.04648">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UCDNet: Multi-UAV Collaborative 3D Object Detection Network by Reliable Feature Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tian%2C+P">Pengju Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+P">Peirui Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuchao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhechao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhirui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Menglong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xian Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04648v1-abstract-short" style="display: inline;"> Multi-UAV collaborative 3D object detection can perceive and comprehend complex environments by integrating complementary information, with applications encompassing traffic monitoring, delivery services and agricultural management. However, the extremely broad observations in aerial remote sensing and significant perspective differences across multiple UAVs make it challenging to achieve precise&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04648v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04648v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04648v1-abstract-full" style="display: none;"> Multi-UAV collaborative 3D object detection can perceive and comprehend complex environments by integrating complementary information, with applications encompassing traffic monitoring, delivery services and agricultural management. However, the extremely broad observations in aerial remote sensing and significant perspective differences across multiple UAVs make it challenging to achieve precise and consistent feature mapping from 2D images to 3D space in multi-UAV collaborative 3D object detection paradigm. To address the problem, we propose an unparalleled camera-based multi-UAV collaborative 3D object detection paradigm called UCDNet. Specifically, the depth information from the UAVs to the ground is explicitly utilized as a strong prior to provide a reference for more accurate and generalizable feature mapping. Additionally, we design a homologous points geometric consistency loss as an auxiliary self-supervision, which directly influences the feature mapping module, thereby strengthening the global consistency of multi-view perception. Experiments on AeroCollab3D and CoPerception-UAVs datasets show our method increases 4.7% and 10% mAP respectively compared to the baseline, which demonstrates the superiority of UCDNet. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04648v1-abstract-full').style.display = 'none'; document.getElementById('2406.04648v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03210">arXiv:2406.03210</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.03210">pdf</a>, <a href="https://arxiv.org/format/2406.03210">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Text-like Encoding of Collaborative Information in Large Language Models for Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+K">Keqin Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03210v1-abstract-short" style="display: inline;"> When adapting Large Language Models for Recommendation (LLMRec), it is crucial to integrate collaborative information. Existing methods achieve this by learning collaborative embeddings in LLMs&#39; latent space from scratch or by mapping from external models. However, they fail to represent the information in a text-like format, which may not align optimally with LLMs. To bridge this gap, we introduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03210v1-abstract-full').style.display = 'inline'; document.getElementById('2406.03210v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03210v1-abstract-full" style="display: none;"> When adapting Large Language Models for Recommendation (LLMRec), it is crucial to integrate collaborative information. Existing methods achieve this by learning collaborative embeddings in LLMs&#39; latent space from scratch or by mapping from external models. However, they fail to represent the information in a text-like format, which may not align optimally with LLMs. To bridge this gap, we introduce BinLLM, a novel LLMRec method that seamlessly integrates collaborative information through text-like encoding. BinLLM converts collaborative embeddings from external models into binary sequences -- a specific text format that LLMs can understand and operate on directly, facilitating the direct usage of collaborative information in text-like format by LLMs. Additionally, BinLLM provides options to compress the binary sequence using dot-decimal notation to avoid excessively long lengths. 
arXiv:2406.03172  [pdf, other]  cs.LG (Machine Learning)
Initialization-enhanced Physics-Informed Neural Network with Domain Decomposition (IDPINN)
Authors: Chenhao Si, Ming Yan
Abstract: We propose a new physics-informed neural network framework, IDPINN, based on the enhancement of initialization and domain decomposition to improve prediction accuracy. We train a PINN using a small dataset to obtain an initial network structure, including the weight matrices and biases, which initializes the PINN for each subdomain. Moreover, we leverage the smoothness condition on the interface to enhance the prediction performance. We numerically evaluated it on several forward problems and demonstrated the benefits of IDPINN in terms of accuracy.
Submitted 5 June, 2024; originally announced June 2024.
Comments: 20 pages, 14 figures

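The interface smoothness condition mentioned in the IDPINN abstract can be read as an extra loss term that ties two subdomain networks together at shared collocation points. The PyTorch snippet below is only a schematic sketch under assumed details (two 1D subdomains meeting at x = 0.5, penalizing both value and first-derivative mismatch); it is not the authors' implementation.

    import torch
    import torch.nn as nn

    def mlp():
        return nn.Sequential(nn.Linear(1, 32), nn.Tanh(),
                             nn.Linear(32, 32), nn.Tanh(), nn.Linear(32, 1))

    net_a, net_b = mlp(), mlp()             # one PINN per subdomain; weights could be copied
                                            # from a PINN pre-trained on a small dataset

    def interface_loss(x_iface):
        """Penalize value and slope mismatch of the two subdomain PINNs at the interface."""
        x = x_iface.clone().requires_grad_(True)
        ua, ub = net_a(x), net_b(x)
        dua = torch.autograd.grad(ua.sum(), x, create_graph=True)[0]
        dub = torch.autograd.grad(ub.sum(), x, create_graph=True)[0]
        return ((ua - ub) ** 2).mean() + ((dua - dub) ** 2).mean()

    x_interface = torch.full((16, 1), 0.5)  # assumed interface location
    loss = interface_loss(x_interface)      # added to the usual PDE residual losses
    print(loss.item())
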
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03172v1-abstract-full').style.display = 'none'; document.getElementById('2406.03172v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01014">arXiv:2406.01014</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.01014">pdf</a>, <a href="https://arxiv.org/format/2406.01014">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Junyang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Haiyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+H">Haitao Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Ming Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+W">Weizhou Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sang%2C+J">Jitao Sang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01014v1-abstract-short" style="display: inline;"> Mobile device operation tasks are increasingly becoming a popular multi-modal AI application scenario. Current Multi-modal Large Language Models (MLLMs), constrained by their training data, lack the capability to function effectively as operation assistants. Instead, MLLM-based agents, which enhance capabilities through tool invocation, are gradually being applied to this scenario. However, the tw&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01014v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01014v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01014v1-abstract-full" style="display: none;"> Mobile device operation tasks are increasingly becoming a popular multi-modal AI application scenario. Current Multi-modal Large Language Models (MLLMs), constrained by their training data, lack the capability to function effectively as operation assistants. Instead, MLLM-based agents, which enhance capabilities through tool invocation, are gradually being applied to this scenario. 
arXiv:2406.00988  [pdf, other]  cs.AR (Hardware Architecture)
ADE-HGNN: Accelerating HGNNs through Attention Disparity Exploitation
Authors: Dengke Han, Meng Wu, Runzhen Xue, Mingyu Yan, Xiaochun Ye, Dongrui Fan
Abstract: Heterogeneous Graph Neural Networks (HGNNs) have recently demonstrated great power in handling heterogeneous graph data, rendering them widely applied in many critical real-world domains. Most HGNN models leverage attention mechanisms to significantly improve model accuracy, albeit at the cost of increased computational complexity and memory bandwidth requirements. Fortunately, the attention disparity from source vertices towards a common target vertex unveils an opportunity to boost the model execution performance by pruning unimportant source vertices during neighbor aggregation. In this study, we commence with a quantitative analysis of the attention disparity in HGNN models, where the importance of different source vertices varies for the same target vertex. To fully exploit this finding for inference acceleration, we propose a runtime pruning method based on a min-heap and map it to a dedicated hardware pruner to discard unimportant vertices. Given that the pruning overhead itself is non-negligible and cannot be amortized by the conventional staged execution paradigm, an operation-fusion execution flow of HGNNs is introduced to overlap the pruning overhead while harnessing inter-stage parallelism. Finally, we present the design of a novel HGNN accelerator, ADE-HGNN, tailored to support the proposed execution framework. Our experimental results demonstrate that ADE-HGNN achieves an average performance improvement of 28.21x over the NVIDIA GPU T4 platform and 7.98x over the advanced GPU A100, with the inference accuracy loss kept within a negligible range of 0.11%~1.47%. Furthermore, ADE-HGNN significantly reduces energy consumption to 1.97% and 5.37% of the two platforms, respectively.
Submitted 3 June, 2024; originally announced June 2024.
Comments: 15 pages, 9 figures, accepted by Euro-PAR 2024

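The min-heap-based runtime pruning in the ADE-HGNN abstract amounts to keeping, for each target vertex, only the k source neighbors with the largest attention scores. The hardware pruner itself cannot be reproduced here, but a software analogue of the idea is sketched below (the scores and k are made up); heapq maintains the current top-k in O(log k) per streamed neighbor.

    import heapq

    def prune_neighbors(attention_scores, k):
        """Keep the k source vertices with the largest attention toward one target vertex.

        attention_scores: iterable of (source_vertex_id, score) pairs streamed during aggregation.
        A min-heap of size k holds the current survivors; smaller scores are evicted on the fly.
        """
        heap = []                                    # (score, vertex) pairs, smallest score on top
        for vertex, score in attention_scores:
            if len(heap) < k:
                heapq.heappush(heap, (score, vertex))
            elif score > heap[0][0]:
                heapq.heapreplace(heap, (score, vertex))
        return [vertex for _, vertex in heap]        # unimportant sources are simply discarded

    scores = [(0, 0.02), (1, 0.41), (2, 0.05), (3, 0.30), (4, 0.22)]
    print(prune_neighbors(scores, k=3))              # vertices 1, 3 and 4 survive aggregation
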
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 9 figures, accepted by Euro-PAR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.00683">arXiv:2406.00683</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.00683">pdf</a>, <a href="https://arxiv.org/format/2406.00683">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Exploiting Frequency Correlation for Hyperspectral Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+M">Muge Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lizhi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Hua Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.00683v1-abstract-short" style="display: inline;"> Deep priors have emerged as potent methods in hyperspectral image (HSI) reconstruction. While most methods emphasize space-domain learning using image space priors like non-local similarity, frequency-domain learning using image frequency priors remains neglected, limiting the reconstruction capability of networks. In this paper, we first propose a Hyperspectral Frequency Correlation (HFC) prior r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.00683v1-abstract-full').style.display = 'inline'; document.getElementById('2406.00683v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.00683v1-abstract-full" style="display: none;"> Deep priors have emerged as potent methods in hyperspectral image (HSI) reconstruction. While most methods emphasize space-domain learning using image space priors like non-local similarity, frequency-domain learning using image frequency priors remains neglected, limiting the reconstruction capability of networks. In this paper, we first propose a Hyperspectral Frequency Correlation (HFC) prior rooted in in-depth statistical frequency analyses of existent HSI datasets. Leveraging the HFC prior, we subsequently establish the frequency domain learning composed of a Spectral-wise self-Attention of Frequency (SAF) and a Spectral-spatial Interaction of Frequency (SIF) targeting low-frequency and high-frequency components, respectively. The outputs of SAF and SIF are adaptively merged by a learnable gating filter, thus achieving a thorough exploitation of image frequency priors. Integrating the frequency domain learning and the existing space domain learning, we finally develop the Correlation-driven Mixing Domains Transformer (CMDT) for HSI reconstruction. 
arXiv:2405.06247  [pdf, other]  cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CR (Cryptography and Security)
Disttack: Graph Adversarial Attacks Toward Distributed GNN Training
Authors: Yuxiang Zhang, Xin Liu, Meng Wu, Wei Yan, Mingyu Yan, Xiaochun Ye, Dongrui Fan
Abstract: Graph Neural Networks (GNNs) have emerged as potent models for graph learning. Distributing the training process across multiple computing nodes is the most promising solution to address the challenges of ever-growing real-world graphs. However, current adversarial attack methods on GNNs neglect the characteristics and applications of the distributed scenario, leading to suboptimal performance and inefficiency in attacking distributed GNN training. In this study, we introduce Disttack, the first framework of adversarial attacks for distributed GNN training that leverages the characteristics of frequent gradient updates in a distributed system. Specifically, Disttack corrupts distributed GNN training by injecting adversarial attacks into one single computing node. The attacked subgraphs are precisely perturbed to induce an abnormal gradient ascent in backpropagation, disrupting gradient synchronization between computing nodes and thus leading to a significant performance decline of the trained GNN. We evaluate Disttack on four large real-world graphs by attacking five widely adopted GNNs. Compared with the state-of-the-art attack method, experimental results demonstrate that Disttack amplifies the model accuracy degradation by 2.75x and achieves a speedup of 17.33x on average while maintaining unnoticeability.
Submitted 10 May, 2024; originally announced May 2024.
Comments: Accepted by the 30th International European Conference on Parallel and Distributed Computing (Euro-Par 2024)

arXiv:2404.18166  [pdf, other]  cs.IR (Information Retrieval)
Behavior-Contextualized Item Preference Modeling for Multi-Behavior Recommendation
Authors: Mingshi Yan, Fan Liu, Jing Sun, Fuming Sun, Zhiyong Cheng, Yahong Han
Abstract: In recommender systems, multi-behavior methods have demonstrated their effectiveness in mitigating issues like data sparsity, a common challenge in traditional single-behavior recommendation approaches. These methods typically infer user preferences from various auxiliary behaviors and apply them to the target behavior for recommendations. However, this direct transfer can introduce noise to the target behavior in recommendation, due to variations in user attention across different behaviors. To address this issue, this paper introduces a novel approach, Behavior-Contextualized Item Preference Modeling (BCIPM), for multi-behavior recommendation. Our proposed Behavior-Contextualized Item Preference Network discerns and learns users' specific item preferences within each behavior. It then considers only those preferences relevant to the target behavior for final recommendations, significantly reducing noise from auxiliary behaviors. These auxiliary behaviors are utilized solely for training the network parameters, thereby refining the learning process without compromising the accuracy of the target behavior recommendations. To further enhance the effectiveness of BCIPM, we adopt a strategy of pre-training the initial embeddings. This step is crucial for enriching the item-aware preferences, particularly in scenarios where data related to the target behavior is sparse. Comprehensive experiments conducted on four real-world datasets demonstrate BCIPM's superior performance compared to several leading state-of-the-art models, validating the robustness and efficiency of our proposed approach.
Submitted 28 April, 2024; originally announced April 2024.
Comments: This paper has been accepted by SIGIR 2024

arXiv:2404.17238  [pdf, other]  cs.IR (Information Retrieval)
TruthSR: Trustworthy Sequential Recommender Systems via User-generated Multimodal Content
Authors: Meng Yan, Haibin Huang, Ying Liu, Juan Zhao, Xiyue Gao, Cai Xu, Ziyu Guan, Wei Zhao
Abstract: Sequential recommender systems explore users' preferences and behavioral patterns from their historically generated data. Recently, researchers have aimed to improve sequential recommendation by utilizing massive user-generated multi-modal content, such as reviews, images, etc. This content often contains inevitable noise. Some studies attempt to reduce noise interference by suppressing cross-modal inconsistent information. However, they could potentially constrain the capturing of personalized user preferences. In addition, it is almost impossible to entirely eliminate noise in diverse user-generated multi-modal content. To solve these problems, we propose a trustworthy sequential recommendation method via noisy user-generated multi-modal content. Specifically, we explicitly capture the consistency and complementarity of user-generated multi-modal content to mitigate noise interference. We also achieve the modeling of the user's multi-modal sequential preferences. In addition, we design a trustworthy decision mechanism that integrates the subjective user perspective and the objective item perspective to dynamically evaluate the uncertainty of prediction results. Experimental evaluation on four widely-used datasets demonstrates the superior performance of our model compared to state-of-the-art methods. The code is released at https://github.com/FairyMeng/TrustSR.
Submitted 26 April, 2024; originally announced April 2024.

arXiv:2404.16635  [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts Learning
Authors: Liang Zhang, Anwen Hu, Haiyang Xu, Ming Yan, Yichen Xu, Qin Jin, Ji Zhang, Fei Huang
Abstract: Charts are important for presenting and explaining complex data relationships. Recently, multimodal large language models (MLLMs) have shown remarkable capabilities in various chart understanding tasks. However, the sheer size of these models in terms of parameters and computational requirements limits their use in resource-constrained environments. In this paper, we present TinyChart, an efficient MLLM for chart understanding with only 3B parameters. TinyChart overcomes two key challenges in efficient chart understanding: (1) it reduces the burden of learning numerical computations through a Program-of-Thoughts (PoT) learning strategy, which trains the model to generate Python programs for numerical calculations, and (2) it reduces the lengthy vision feature sequences produced by the vision transformer for high-resolution images through a Vision Token Merging module, which gradually merges the most similar vision tokens. Extensive experiments demonstrate that our 3B TinyChart achieves SOTA performance on a variety of chart understanding benchmarks including ChartQA, Chart-to-Text, Chart-to-Table, OpenCQA, and ChartX. It outperforms several chart understanding MLLMs with up to 13B parameters, such as ChartLlama and ChartAst, and the closed-source general-purpose MLLM GPT-4V on ChartQA. It also demonstrates its superior efficiency with higher throughput during inference due to a smaller model scale and more efficient vision encoding. Our code and model are available at https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/TinyChart.
Submitted 25 April, 2024; originally announced April 2024.
Comments: 13 pages, 11 figures

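The Vision Token Merging module summarized in the TinyChart abstract shortens the vision sequence by repeatedly fusing the most similar tokens. The snippet below is a simplified, hedged sketch of that idea (a greedy cosine-similarity merge of one pair per call, averaging the merged pair, on a small toy tensor); the actual module merges many pairs per layer and lives in the linked repository.

    import torch
    import torch.nn.functional as F

    def merge_most_similar(tokens):
        """Merge the single most similar pair of vision tokens by averaging them.

        tokens: (n, d) tensor of vision-token features; returns an (n - 1, d) tensor.
        """
        n = tokens.shape[0]
        normed = F.normalize(tokens, dim=-1)
        sim = normed @ normed.T                       # pairwise cosine similarity, (n, n)
        sim.fill_diagonal_(-1.0)                      # ignore self-similarity
        i, j = divmod(int(sim.argmax()), n)           # indices of the closest pair
        merged = (tokens[i] + tokens[j]) / 2
        keep = [k for k in range(n) if k not in (i, j)]
        return torch.cat([tokens[keep], merged.unsqueeze(0)], dim=0)

    tokens = torch.randn(64, 32)                      # toy sequence; real ViT sequences are far longer
    while tokens.shape[0] > 16:                       # shrink the sequence to 25% of its length
        tokens = merge_most_similar(tokens)
    print(tokens.shape)                               # torch.Size([16, 32])
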
arXiv:2404.16484  [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Real-Time 4K Super-Resolution of Compressed AVIF Images. AIS 2024 Challenge Survey
Authors: Marcos V. Conde, Zhijun Lei, Wen Li, Cosmin Stejerean, Ioannis Katsavounidis, Radu Timofte, Kihwan Yoon, Ganzorig Gankhuyag, Jiangtao Lv, Long Sun, Jinshan Pan, Jiangxin Dong, Jinhui Tang, Zhiyuan Li, Hao Wei, Chenyang Ge, Dongyang Zhang, Tianle Liu, Huaian Chen, Yi Jin, Menghan Zhou, Yiqiang Yan, Si Gao, Biao Wu, Shaoli Liu, et al. (50 additional authors not shown)
Abstract: This paper introduces a novel benchmark as part of the AIS 2024 Real-Time Image Super-Resolution (RTSR) Challenge, which aims to upscale compressed images from 540p to 4K resolution (4x factor) in real-time on commercial GPUs. For this, we use a diverse test set containing a variety of 4K images ranging from digital art to gaming and photography. The images are compressed using the modern AVIF codec, instead of JPEG. All the proposed methods improve PSNR fidelity over Lanczos interpolation, and process images under 10ms. Out of the 160 participants, 25 teams submitted their code and models. The solutions present novel designs tailored for memory-efficiency and runtime on edge devices. This survey describes the best solutions for real-time SR of compressed high-resolution images.
Submitted 25 April, 2024; originally announced April 2024.
Comments: CVPR 2024, AI for Streaming (AIS) Workshop

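The evaluation baseline named in the abstract above (PSNR fidelity relative to Lanczos interpolation) is straightforward to reproduce in outline. The snippet below is a hedged illustration using Pillow and NumPy with made-up file names: it upscales a 540p input by 4x with Lanczos resampling and reports PSNR against the 4K ground truth, which is the reference any challenge entry must beat.

    import numpy as np
    from PIL import Image

    def psnr(a, b):
        """Peak signal-to-noise ratio between two 8-bit images."""
        mse = np.mean((a.astype(np.float64) - b.astype(np.float64)) ** 2)
        return float("inf") if mse == 0 else 10 * np.log10(255.0 ** 2 / mse)

    # hypothetical file names; reading AVIF requires an AVIF-capable Pillow plugin
    low = Image.open("input_540p.avif").convert("RGB")
    high = Image.open("ground_truth_4k.png").convert("RGB")

    baseline = low.resize((low.width * 4, low.height * 4), Image.LANCZOS)
    print("Lanczos baseline PSNR:", psnr(np.asarray(baseline), np.asarray(high)))
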
arXiv:2404.10343  [pdf, other]  cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
The Ninth NTIRE 2024 Efficient Super-Resolution Challenge Report
Authors: Bin Ren, Yawei Li, Nancy Mehta, Radu Timofte, Hongyuan Yu, Cheng Wan, Yuxin Hong, Bingnan Han, Zhuoyuan Wu, Yajun Zou, Yuqing Liu, Jizhe Li, Keji He, Chao Fan, Heng Zhang, Xiaolin Zhang, Xuanwu Yin, Kunlong Zuo, Bohao Liao, Peizhe Xia, Long Peng, Zhibo Du, Xin Di, Wangkai Li, Yang Wang, et al. (109 additional authors not shown)
Abstract: This paper provides a comprehensive review of the NTIRE 2024 challenge, focusing on efficient single-image super-resolution (ESR) solutions and their outcomes. The task of this challenge is to super-resolve an input image with a magnification factor of x4 based on pairs of low and corresponding high-resolution images. The primary objective is to develop networks that optimize various aspects such as runtime, parameters, and FLOPs, while still maintaining a peak signal-to-noise ratio (PSNR) of approximately 26.90 dB on the DIV2K_LSDIR_valid dataset and 26.99 dB on the DIV2K_LSDIR_test dataset. In addition, this challenge has 4 tracks including the main track (overall performance), sub-track 1 (runtime), sub-track 2 (FLOPs), and sub-track 3 (parameters). In the main track, all three metrics (i.e., runtime, FLOPs, and parameter count) were considered. The ranking of the main track is calculated based on a weighted sum-up of the scores of all other sub-tracks. In sub-track 1, the practical runtime performance of the submissions was evaluated, and the corresponding score was used to determine the ranking. In sub-track 2, the number of FLOPs was considered. The score calculated based on the corresponding FLOPs was used to determine the ranking. In sub-track 3, the number of parameters was considered. The score calculated based on the corresponding parameters was used to determine the ranking. RLFN is set as the baseline for efficiency measurement. The challenge had 262 registered participants, and 34 teams made valid submissions. These submissions gauge the state of the art in efficient single-image super-resolution. To facilitate the reproducibility of the challenge and enable other researchers to build upon these findings, the code and the pre-trained models of validated solutions are made publicly available at https://github.com/Amazingren/NTIRE2024_ESR/.
Submitted 25 June, 2024; v1 submitted 16 April, 2024; originally announced April 2024.
Comments: The report paper of NTIRE 2024 Efficient Super-Resolution, accepted by CVPRW 2024

href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10