Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 9,134 results for author: <span class="mathjax">Zhang, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Zhang, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14421">arXiv:2411.14421</a> <span> [<a href="https://arxiv.org/pdf/2411.14421">pdf</a>, <a href="https://arxiv.org/format/2411.14421">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From RNNs to Foundation Models: An Empirical Study on Commercial Building Energy Consumption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bose%2C+S">Shourya Bose</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yijiang Li</a>, <a href="/search/cs?searchtype=author&query=Van+Sant%2C+A">Amy Van Sant</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kibaek Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14421v1-abstract-short" style="display: inline;"> Accurate short-term energy consumption forecasting for commercial buildings is crucial for smart grid operations. While smart meters and deep learning models enable forecasting using past data from multiple buildings, data heterogeneity from diverse buildings can reduce model performance. The impact of increasing dataset heterogeneity in time series forecasting, while keeping size and model consta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14421v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14421v1-abstract-full" style="display: none;"> Accurate short-term energy consumption forecasting for commercial buildings is crucial for smart grid operations. While smart meters and deep learning models enable forecasting using past data from multiple buildings, data heterogeneity from diverse buildings can reduce model performance. The impact of increasing dataset heterogeneity in time series forecasting, while keeping size and model constant, is understudied. We tackle this issue using the ComStock dataset, which provides synthetic energy consumption data for U.S. commercial buildings. Two curated subsets, identical in size and region but differing in building type diversity, are used to assess the performance of various time series forecasting models, including fine-tuned open-source foundation models (FMs). The results show that dataset heterogeneity and model architecture have a greater impact on post-training forecasting performance than the parameter count. Moreover, despite the higher computational cost, fine-tuned FMs demonstrate competitive performance compared to base models trained from scratch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14421v1-abstract-full').style.display = 'none'; document.getElementById('2411.14421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Workshop on Time Series in the Age of Large Models</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14401">arXiv:2411.14401</a> <span> [<a href="https://arxiv.org/pdf/2411.14401">pdf</a>, <a href="https://arxiv.org/format/2411.14401">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Training: Dynamic Token Merging for Zero-Shot Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhuokai Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhaorun Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zenghui Ding</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianjun Yang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yining Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14401v1-abstract-short" style="display: inline;"> Recent advancements in multimodal large language models (MLLMs) have opened new avenues for video understanding. However, achieving high fidelity in zero-shot video tasks remains challenging. Traditional video processing methods rely heavily on fine-tuning to capture nuanced spatial-temporal details, which incurs significant data and computation costs. In contrast, training-free approaches, though… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14401v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14401v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14401v1-abstract-full" style="display: none;"> Recent advancements in multimodal large language models (MLLMs) have opened new avenues for video understanding. However, achieving high fidelity in zero-shot video tasks remains challenging. Traditional video processing methods rely heavily on fine-tuning to capture nuanced spatial-temporal details, which incurs significant data and computation costs. In contrast, training-free approaches, though efficient, often lack robustness in preserving context-rich features across complex video content. To this end, we propose DYTO, a novel dynamic token merging framework for zero-shot video understanding that adaptively optimizes token efficiency while preserving crucial scene details. DYTO integrates a hierarchical frame selection and a bipartite token merging strategy to dynamically cluster key frames and selectively compress token sequences, striking a balance between computational efficiency with semantic richness. Extensive experiments across multiple benchmarks demonstrate the effectiveness of DYTO, achieving superior performance compared to both fine-tuned and training-free methods and setting a new state-of-the-art for zero-shot video understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14401v1-abstract-full').style.display = 'none'; document.getElementById('2411.14401v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14381">arXiv:2411.14381</a> <span> [<a href="https://arxiv.org/pdf/2411.14381">pdf</a>, <a href="https://arxiv.org/format/2411.14381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ETA-IK: Execution-Time-Aware Inverse Kinematics for Dual-Arm Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongzhou Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Mamaev%2C+I">Ilshat Mamaev</a>, <a href="/search/cs?searchtype=author&query=Hein%2C+B">Bj枚rn Hein</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14381v1-abstract-short" style="display: inline;"> This paper presents ETA-IK, a novel Execution-Time-Aware Inverse Kinematics method tailored for dual-arm robotic systems. The primary goal is to optimize motion execution time by leveraging the redundancy of both arms, specifically in tasks where only the relative pose of the robots is constrained, such as dual-arm scanning of unknown objects. Unlike traditional inverse kinematics methods that use… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14381v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14381v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14381v1-abstract-full" style="display: none;"> This paper presents ETA-IK, a novel Execution-Time-Aware Inverse Kinematics method tailored for dual-arm robotic systems. The primary goal is to optimize motion execution time by leveraging the redundancy of both arms, specifically in tasks where only the relative pose of the robots is constrained, such as dual-arm scanning of unknown objects. Unlike traditional inverse kinematics methods that use surrogate metrics such as joint configuration distance, our method incorporates direct motion execution time and implicit collisions into the optimization process, thereby finding target joints that allow subsequent trajectory generation to get more efficient and collision-free motion. A neural network based execution time approximator is employed to predict time-efficient joint configurations while accounting for potential collisions. Through experimental evaluation on a system composed of a UR5 and a KUKA iiwa robot, we demonstrate significant reductions in execution time. The proposed method outperforms conventional approaches, showing improved motion efficiency without sacrificing positioning accuracy. These results highlight the potential of ETA-IK to improve the performance of dual-arm systems in applications, where efficiency and safety are paramount. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14381v1-abstract-full').style.display = 'none'; document.getElementById('2411.14381v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14347">arXiv:2411.14347</a> <span> [<a href="https://arxiv.org/pdf/2411.14347">pdf</a>, <a href="https://arxiv.org/format/2411.14347">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DINO-X: A Unified Vision Model for Open-World Object Detection and Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+T">Tianhe Ren</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yihao Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qing Jiang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhaoyang Zeng</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Y">Yuda Xiong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenlong Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhengyu Ma</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Junyi Shen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiaoke Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhuheng Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hongjie Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Han Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shilong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Feng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kent Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14347v1-abstract-short" style="display: inline;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extend… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14347v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14347v1-abstract-full" style="display: none;"> In this paper, we introduce DINO-X, which is a unified object-centric vision model developed by IDEA Research with the best open-world object detection performance to date. DINO-X employs the same Transformer-based encoder-decoder architecture as Grounding DINO 1.5 to pursue an object-level representation for open-world object understanding. To make long-tailed object detection easy, DINO-X extends its input options to support text prompt, visual prompt, and customized prompt. With such flexible prompt options, we develop a universal object prompt to support prompt-free open-world detection, making it possible to detect anything in an image without requiring users to provide any prompt. To enhance the model's core grounding capability, we have constructed a large-scale dataset with over 100 million high-quality grounding samples, referred to as Grounding-100M, for advancing the model's open-vocabulary detection performance. Pre-training on such a large-scale grounding dataset leads to a foundational object-level representation, which enables DINO-X to integrate multiple perception heads to simultaneously support multiple object perception and understanding tasks, including detection, segmentation, pose estimation, object captioning, object-based QA, etc. Experimental results demonstrate the superior performance of DINO-X. Specifically, the DINO-X Pro model achieves 56.0 AP, 59.8 AP, and 52.4 AP on the COCO, LVIS-minival, and LVIS-val zero-shot object detection benchmarks, respectively. Notably, it scores 63.3 AP and 56.5 AP on the rare classes of LVIS-minival and LVIS-val benchmarks, both improving the previous SOTA performance by 5.8 AP. Such a result underscores its significantly improved capacity for recognizing long-tailed objects. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14347v1-abstract-full').style.display = 'none'; document.getElementById('2411.14347v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14279">arXiv:2411.14279</a> <span> [<a href="https://arxiv.org/pdf/2411.14279">pdf</a>, <a href="https://arxiv.org/format/2411.14279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Looking Beyond Text: Reducing Language bias in Large Vision-Language Models via Multimodal Dual-Attention and Soft-Image Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Baobao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14279v1-abstract-short" style="display: inline;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14279v1-abstract-full" style="display: none;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage of LLM and multimodal alignment stage. 2. The learned inference bias due to short-term dependency of text data. Therefore, we propose LACING, a systemic framework designed to address the language bias of LVLMs with muLtimodal duAl-attention meChanIsm (MDA) aNd soft-image Guidance (IFG). Specifically, MDA introduces a parallel dual-attention mechanism that enhances the integration of visual inputs across the model. IFG introduces a learnable soft visual prompt during training and inference to replace visual inputs, designed to compel LVLMs to prioritize text inputs. Then, IFG further proposes a novel decoding strategy using the soft visual prompt to mitigate the model's over-reliance on adjacent text inputs. Comprehensive experiments demonstrate that our method effectively debiases LVLMs from their language bias, enhancing visual comprehension and reducing hallucinations without requiring additional training resources or data. The code and model are available at [lacing-lvlm.github.io](https://lacing-lvlm.github.io). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'none'; document.getElementById('2411.14279v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14250">arXiv:2411.14250</a> <span> [<a href="https://arxiv.org/pdf/2411.14250">pdf</a>, <a href="https://arxiv.org/format/2411.14250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CP-UNet: Contour-based Probabilistic Model for Medical Ultrasound Images Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+R">Ruiguo Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuewei Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jie Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14250v1-abstract-short" style="display: inline;"> Deep learning-based segmentation methods are widely utilized for detecting lesions in ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14250v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14250v1-abstract-full" style="display: none;"> Deep learning-based segmentation methods are widely utilized for detecting lesions in ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, which guides the segmentation network to enhance its focus on contour during decoding. We design a novel down-sampling module to enable the contour probability distribution modeling and encoding stages to acquire global-local features. Furthermore, the Gaussian Mixture Model utilizes optimized features to model the contour distribution, capturing the uncertainty of lesion boundaries. Extensive experiments with several state-of-the-art deep learning segmentation methods on three ultrasound image datasets show that our method performs better on breast and thyroid lesions segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14250v1-abstract-full').style.display = 'none'; document.getElementById('2411.14250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 4 figures, 2 tables;For icassp2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14141">arXiv:2411.14141</a> <span> [<a href="https://arxiv.org/pdf/2411.14141">pdf</a>, <a href="https://arxiv.org/format/2411.14141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Differentiable SVD based on Moore-Penrose Pseudoinverse for Inverse Imaging Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yue Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14141v1-abstract-short" style="display: inline;"> Low-rank regularization-based deep unrolling networks have achieved remarkable success in various inverse imaging problems (IIPs). However, the singular value decomposition (SVD) is non-differentiable when duplicated singular values occur, leading to severe numerical instability during training. In this paper, we propose a differentiable SVD based on the Moore-Penrose pseudoinverse to address this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14141v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14141v1-abstract-full" style="display: none;"> Low-rank regularization-based deep unrolling networks have achieved remarkable success in various inverse imaging problems (IIPs). However, the singular value decomposition (SVD) is non-differentiable when duplicated singular values occur, leading to severe numerical instability during training. In this paper, we propose a differentiable SVD based on the Moore-Penrose pseudoinverse to address this issue. To the best of our knowledge, this is the first work to provide a comprehensive analysis of the differentiability of the trivial SVD. Specifically, we show that the non-differentiability of SVD is essentially due to an underdetermined system of linear equations arising in the derivation process. We utilize the Moore-Penrose pseudoinverse to solve the system, thereby proposing a differentiable SVD. A numerical stability analysis in the context of IIPs is provided. Experimental results in color image compressed sensing and dynamic MRI reconstruction show that our proposed differentiable SVD can effectively address the numerical instability issue while ensuring computational precision. Code is available at https://github.com/yhao-z/SVD-inv. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14141v1-abstract-full').style.display = 'none'; document.getElementById('2411.14141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> G.1.4; I.2.0; I.4.4; I.4.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14053">arXiv:2411.14053</a> <span> [<a href="https://arxiv.org/pdf/2411.14053">pdf</a>, <a href="https://arxiv.org/format/2411.14053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stereo Anything: Unifying Stereo Matching with Large-Scale Mixed Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+D">Dujun Nie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14053v1-abstract-short" style="display: inline;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14053v1-abstract-full" style="display: none;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diverse environments. To this end, we scale up the dataset by collecting labeled stereo images and generating synthetic stereo pairs from unlabeled monocular images. To further enrich the model's ability to generalize across different conditions, we introduce a novel synthetic dataset that complements existing data by adding variability in baselines, camera angles, and scene types. We extensively evaluate the zero-shot capabilities of our model on five public datasets, showcasing its impressive ability to generalize to new, unseen data. Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'none'; document.getElementById('2411.14053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13900">arXiv:2411.13900</a> <span> [<a href="https://arxiv.org/pdf/2411.13900">pdf</a>, <a href="https://arxiv.org/format/2411.13900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiajie Chen</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+P">Peng Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youhui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13900v1-abstract-short" style="display: inline;"> Branch predictor (BP) is a critical component of modern processors, and its accurate modeling is essential for compilers and applications. However, processor vendors have disclosed limited details about their BP implementations. Recent advancements in reverse engineering the BP of general-purpose processors have enabled the creation of more accurate BP models. Nonetheless, we have identified cri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13900v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13900v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13900v1-abstract-full" style="display: none;"> Branch predictor (BP) is a critical component of modern processors, and its accurate modeling is essential for compilers and applications. However, processor vendors have disclosed limited details about their BP implementations. Recent advancements in reverse engineering the BP of general-purpose processors have enabled the creation of more accurate BP models. Nonetheless, we have identified critical deficiencies in the existing methods. For instance, they impose strong assumptions on the branch history update function and the index/tag functions of key BP components, limiting their applicability to a broader range of processors, including those from Apple and Qualcomm. In this paper, we design a more general branch prediction reverse engineering pipeline that can additionally recover the conditional branch predictors (CBPs) of Apple Firestorm and Qualcomm Oryon microarchitectures, and subsequently build accurate CBP models. Leveraging these models, we uncover two previously undisclosed effects that impair branch prediction accuracy and propose related solutions, resulting in up to 14% MPKI reduction and 7% performance improvement in representative applications. Furthermore, we conduct a comprehensive comparison of the known Intel/Apple/Qualcomm CBPs using a unified standalone branch predictor simulator, which facilitates a deeper understanding of CBP behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13900v1-abstract-full').style.display = 'none'; document.getElementById('2411.13900v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13851">arXiv:2411.13851</a> <span> [<a href="https://arxiv.org/pdf/2411.13851">pdf</a>, <a href="https://arxiv.org/format/2411.13851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Arm Robot: AR-Enhanced Embodied Control and Visualization for Intuitive Robot Arm Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pei%2C+S">Siyou Pei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Alexander Chen</a>, <a href="/search/cs?searchtype=author&query=Kaoshik%2C+R">Ronak Kaoshik</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruofei Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13851v1-abstract-short" style="display: inline;"> Embodied interaction has been introduced to human-robot interaction (HRI) as a type of teleoperation, in which users control robot arms with bodily action via handheld controllers or haptic gloves. Embodied teleoperation has made robot control intuitive to non-technical users, but differences between humans' and robots' capabilities \eg ranges of motion and response time, remain challenging. In re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13851v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13851v1-abstract-full" style="display: none;"> Embodied interaction has been introduced to human-robot interaction (HRI) as a type of teleoperation, in which users control robot arms with bodily action via handheld controllers or haptic gloves. Embodied teleoperation has made robot control intuitive to non-technical users, but differences between humans' and robots' capabilities \eg ranges of motion and response time, remain challenging. In response, we present Arm Robot, an embodied robot arm teleoperation system that helps users tackle human-robot discrepancies. Specifically, Arm Robot (1) includes AR visualization as real-time feedback on temporal and spatial discrepancies, and (2) allows users to change observing perspectives and expand action space. We conducted a user study (N=18) to investigate the usability of the Arm Robot and learn how users perceive the embodiment. Our results show users could use Arm Robot's features to effectively control the robot arm, providing insights for continued work in embodied HRI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13851v1-abstract-full').style.display = 'none'; document.getElementById('2411.13851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13775">arXiv:2411.13775</a> <span> [<a href="https://arxiv.org/pdf/2411.13775">pdf</a>, <a href="https://arxiv.org/format/2411.13775">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking GPT-4 against Human Translators: A Comprehensive Evaluation Across Languages, Domains, and Expertise Levels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianhao Yan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pingchuan Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yulong Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xianchao Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13775v1-abstract-short" style="display: inline;"> This study presents a comprehensive evaluation of GPT-4's translation capabilities compared to human translators of varying expertise levels. Through systematic human evaluation using the MQM schema, we assess translations across three language pairs (Chinese$\longleftrightarrow$English, Russian$\longleftrightarrow$English, and Chinese$\longleftrightarrow$Hindi) and three domains (News, Technology… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13775v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13775v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13775v1-abstract-full" style="display: none;"> This study presents a comprehensive evaluation of GPT-4's translation capabilities compared to human translators of varying expertise levels. Through systematic human evaluation using the MQM schema, we assess translations across three language pairs (Chinese$\longleftrightarrow$English, Russian$\longleftrightarrow$English, and Chinese$\longleftrightarrow$Hindi) and three domains (News, Technology, and Biomedical). Our findings reveal that GPT-4 achieves performance comparable to junior-level translators in terms of total errors, while still lagging behind senior translators. Unlike traditional Neural Machine Translation systems, which show significant performance degradation in resource-poor language directions, GPT-4 maintains consistent translation quality across all evaluated language pairs. Through qualitative analysis, we identify distinctive patterns in translation approaches: GPT-4 tends toward overly literal translations and exhibits lexical inconsistency, while human translators sometimes over-interpret context and introduce hallucinations. This study represents the first systematic comparison between LLM and human translators across different proficiency levels, providing valuable insights into the current capabilities and limitations of LLM-based translation systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13775v1-abstract-full').style.display = 'none'; document.getElementById('2411.13775v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13632">arXiv:2411.13632</a> <span> [<a href="https://arxiv.org/pdf/2411.13632">pdf</a>, <a href="https://arxiv.org/format/2411.13632">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ID-Patch: Robust ID Association for Group Photo Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yimeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhi%2C+T">Tiancheng Zhi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jing Liu</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+S">Shen Sang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Liming Jiang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Qing Yan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sijia Liu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Linjie Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13632v1-abstract-short" style="display: inline;"> The ability to synthesize personalized group photos and specify the positions of each identity offers immense creative potential. While such imagery can be visually appealing, it presents significant challenges for existing technologies. A persistent issue is identity (ID) leakage, where injected facial features interfere with one another, resulting in low face resemblance, incorrect positioning,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13632v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13632v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13632v1-abstract-full" style="display: none;"> The ability to synthesize personalized group photos and specify the positions of each identity offers immense creative potential. While such imagery can be visually appealing, it presents significant challenges for existing technologies. A persistent issue is identity (ID) leakage, where injected facial features interfere with one another, resulting in low face resemblance, incorrect positioning, and visual artifacts. Existing methods suffer from limitations such as the reliance on segmentation models, increased runtime, or a high probability of ID leakage. To address these challenges, we propose ID-Patch, a novel method that provides robust association between identities and 2D positions. Our approach generates an ID patch and ID embeddings from the same facial features: the ID patch is positioned on the conditional image for precise spatial control, while the ID embeddings integrate with text embeddings to ensure high resemblance. Experimental results demonstrate that ID-Patch surpasses baseline methods across metrics, such as face ID resemblance, ID-position association accuracy, and generation efficiency. Project Page is: https://byteaigc.github.io/ID-Patch/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13632v1-abstract-full').style.display = 'none'; document.getElementById('2411.13632v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page is: https://byteaigc.github.io/ID-Patch/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13504">arXiv:2411.13504</a> <span> [<a href="https://arxiv.org/pdf/2411.13504">pdf</a>, <a href="https://arxiv.org/format/2411.13504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Memory and Reasoning Ability in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+M">Mingyu Jin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weidi Luo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Sitao Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinyi Wang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+W">Wenyue Hua</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruixiang Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongfeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13504v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated strong performance in handling complex tasks requiring both extensive knowledge and reasoning abilities. However, the existing LLM inference pipeline operates as an opaque process without explicit separation between knowledge retrieval and reasoning steps, making the model's decision-making process unclear and disorganized. This ambiguity can lead to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13504v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13504v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13504v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated strong performance in handling complex tasks requiring both extensive knowledge and reasoning abilities. However, the existing LLM inference pipeline operates as an opaque process without explicit separation between knowledge retrieval and reasoning steps, making the model's decision-making process unclear and disorganized. This ambiguity can lead to issues such as hallucinations and knowledge forgetting, which significantly impact the reliability of LLMs in high-stakes domains. In this paper, we propose a new inference paradigm that decomposes the complex inference process into two distinct and clear actions: (1) memory recall: which retrieves relevant knowledge, and (2) reasoning: which performs logical steps based on the recalled knowledge. To facilitate this decomposition, we introduce two special tokens memory and reason, guiding the model to distinguish between steps that require knowledge retrieval and those that involve reasoning. Our experiment results show that this decomposition not only improves model performance but also enhances the interpretability of the inference process, enabling users to identify sources of error and refine model responses effectively. The code is available at https://github.com/MingyuJ666/Disentangling-Memory-and-Reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13504v2-abstract-full').style.display = 'none'; document.getElementById('2411.13504v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13420">arXiv:2411.13420</a> <span> [<a href="https://arxiv.org/pdf/2411.13420">pdf</a>, <a href="https://arxiv.org/format/2411.13420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Heuristically Adaptive Diffusion-Model Evolutionary Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hartl%2C+B">Benedikt Hartl</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanbo Zhang</a>, <a href="/search/cs?searchtype=author&query=Hazan%2C+H">Hananel Hazan</a>, <a href="/search/cs?searchtype=author&query=Levin%2C+M">Michael Levin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13420v1-abstract-short" style="display: inline;"> Diffusion Models represent a significant advancement in generative modeling, employing a dual-phase process that first degrades domain-specific information via Gaussian noise and restores it through a trainable model. This framework enables pure noise-to-data generation and modular reconstruction of, images or videos. Concurrently, evolutionary algorithms employ optimization methods inspired by bi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13420v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13420v1-abstract-full" style="display: none;"> Diffusion Models represent a significant advancement in generative modeling, employing a dual-phase process that first degrades domain-specific information via Gaussian noise and restores it through a trainable model. This framework enables pure noise-to-data generation and modular reconstruction of, images or videos. Concurrently, evolutionary algorithms employ optimization methods inspired by biological principles to refine sets of numerical parameters encoding potential solutions to rugged objective functions. Our research reveals a fundamental connection between diffusion models and evolutionary algorithms through their shared underlying generative mechanisms: both methods generate high-quality samples via iterative refinement on random initial distributions. By employing deep learning-based diffusion models as generative models across diverse evolutionary tasks and iteratively refining diffusion models with heuristically acquired databases, we can iteratively sample potentially better-adapted offspring parameters, integrating them into successive generations of the diffusion model. This approach achieves efficient convergence toward high-fitness parameters while maintaining explorative diversity. Diffusion models introduce enhanced memory capabilities into evolutionary algorithms, retaining historical information across generations and leveraging subtle data correlations to generate refined samples. We elevate evolutionary algorithms from procedures with shallow heuristics to frameworks with deep memory. By deploying classifier-free guidance for conditional sampling at the parameter level, we achieve precise control over evolutionary search dynamics to further specific genotypical, phenotypical, or population-wide traits. Our framework marks a major heuristic and algorithmic transition, offering increased flexibility, precision, and control in evolutionary optimization processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13420v1-abstract-full').style.display = 'none'; document.getElementById('2411.13420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13405">arXiv:2411.13405</a> <span> [<a href="https://arxiv.org/pdf/2411.13405">pdf</a>, <a href="https://arxiv.org/format/2411.13405">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On the Way to LLM Personalization: Learning to Remember User Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Magister%2C+L+C">Lucie Charlotte Magister</a>, <a href="/search/cs?searchtype=author&query=Metcalf%2C+K">Katherine Metcalf</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yizhe Zhang</a>, <a href="/search/cs?searchtype=author&query=ter+Hoeve%2C+M">Maartje ter Hoeve</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13405v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have quickly become an invaluable assistant for a variety of tasks. However, their effectiveness is constrained by their ability to tailor responses to human preferences and behaviors via personalization. Prior work in LLM personalization has largely focused on style transfer or incorporating small factoids about the user, as knowledge injection remains an open challen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13405v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13405v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13405v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have quickly become an invaluable assistant for a variety of tasks. However, their effectiveness is constrained by their ability to tailor responses to human preferences and behaviors via personalization. Prior work in LLM personalization has largely focused on style transfer or incorporating small factoids about the user, as knowledge injection remains an open challenge. In this paper, we explore injecting knowledge of prior conversations into LLMs to enable future work on less redundant, personalized conversations. We identify two real-world constraints: (1) conversations are sequential in time and must be treated as such during training, and (2) per-user personalization is only viable in parameter-efficient settings. To this aim, we propose PLUM, a pipeline performing data augmentation for up-sampling conversations as question-answer pairs, that are then used to finetune a low-rank adaptation adapter with a weighted cross entropy loss. Even in this first exploration of the problem, we perform competitively with baselines such as RAG, attaining an accuracy of 81.5% across 100 conversations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13405v1-abstract-full').style.display = 'none'; document.getElementById('2411.13405v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 6 tables, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13162">arXiv:2411.13162</a> <span> [<a href="https://arxiv.org/pdf/2411.13162">pdf</a>, <a href="https://arxiv.org/format/2411.13162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> IC Mechanisms for Risk-Averse Advertisers in the Online Advertising System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingzhe Wang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+R">Ruohan Qian</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Y">Yuejia Dou</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+B">Bo Shen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changyuan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yixuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yixin Su</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xin Yuan</a>, <a href="/search/cs?searchtype=author&query=liu%2C+W">Wenqiang liu</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+B">Bin Zou</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+W">Wen Yi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhi Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuanglong Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+L">Liu Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13162v1-abstract-short" style="display: inline;"> The autobidding system generates huge revenue for advertising platforms, garnering substantial research attention. Existing studies in autobidding systems focus on designing Autobidding Incentive Compatible (AIC) mechanisms, where the mechanism is Incentive Compatible (IC) under ex ante expectations. However, upon deploying AIC mechanisms in advertising platforms, we observe a notable deviation be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13162v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13162v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13162v1-abstract-full" style="display: none;"> The autobidding system generates huge revenue for advertising platforms, garnering substantial research attention. Existing studies in autobidding systems focus on designing Autobidding Incentive Compatible (AIC) mechanisms, where the mechanism is Incentive Compatible (IC) under ex ante expectations. However, upon deploying AIC mechanisms in advertising platforms, we observe a notable deviation between the actual auction outcomes and these expectations during runtime, particularly in the scene with few clicks (sparse-click). This discrepancy undermines truthful bidding among advertisers in AIC mechanisms, especially for risk-averse advertisers who are averse to outcomes that do not align with the expectations. To address this issue, we propose a mechanism, Decoupled First-Price Auction (DFP), that retains its IC property even during runtime. DFP dynamically adjusts the payment based on real-time user conversion outcomes, ensuring that advertisers' realized utilities closely approximate their expected utilities during runtime. To realize the payment mechanism of DFP, we propose a PPO-based RL algorithm, with a meticulously crafted reward function. This algorithm dynamically adjusts the payment to fit DFP mechanism. We conduct extensive experiments leveraging real-world data to validate our findings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13162v1-abstract-full').style.display = 'none'; document.getElementById('2411.13162v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13154">arXiv:2411.13154</a> <span> [<a href="https://arxiv.org/pdf/2411.13154">pdf</a>, <a href="https://arxiv.org/format/2411.13154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DMQR-RAG: Diverse Multi-Query Rewriting for RAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhicong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhishu Jiang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+H">Hangyu Mao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongxia Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiazhen Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fuzheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13154v1-abstract-short" style="display: inline;"> Large language models often encounter challenges with static knowledge and hallucinations, which undermine their reliability. Retrieval-augmented generation (RAG) mitigates these issues by incorporating external information. However, user queries frequently contain noise and intent deviations, necessitating query rewriting to improve the relevance of retrieved documents. In this paper, we introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13154v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13154v1-abstract-full" style="display: none;"> Large language models often encounter challenges with static knowledge and hallucinations, which undermine their reliability. Retrieval-augmented generation (RAG) mitigates these issues by incorporating external information. However, user queries frequently contain noise and intent deviations, necessitating query rewriting to improve the relevance of retrieved documents. In this paper, we introduce DMQR-RAG, a Diverse Multi-Query Rewriting framework designed to improve the performance of both document retrieval and final responses in RAG. Specifically, we investigate how queries with varying information quantities can retrieve a diverse array of documents, presenting four rewriting strategies that operate at different levels of information to enhance the performance of baseline approaches. Additionally, we propose an adaptive strategy selection method that minimizes the number of rewrites while optimizing overall performance. Our methods have been rigorously validated through extensive experiments conducted in both academic and industry settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13154v1-abstract-full').style.display = 'none'; document.getElementById('2411.13154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13120">arXiv:2411.13120</a> <span> [<a href="https://arxiv.org/pdf/2411.13120">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> </div> </div> <p class="title is-5 mathjax"> Virtual Staining of Label-Free Tissue in Imaging Mass Spectrometry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yijie Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Luzhe Huang</a>, <a href="/search/cs?searchtype=author&query=Pillar%2C+N">Nir Pillar</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuzhu Li</a>, <a href="/search/cs?searchtype=author&query=Migas%2C+L+G">Lukasz G. Migas</a>, <a href="/search/cs?searchtype=author&query=Van+de+Plas%2C+R">Raf Van de Plas</a>, <a href="/search/cs?searchtype=author&query=Spraggins%2C+J+M">Jeffrey M. Spraggins</a>, <a href="/search/cs?searchtype=author&query=Ozcan%2C+A">Aydogan Ozcan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13120v1-abstract-short" style="display: inline;"> Imaging mass spectrometry (IMS) is a powerful tool for untargeted, highly multiplexed molecular mapping of tissue in biomedical research. IMS offers a means of mapping the spatial distributions of molecular species in biological tissue with unparalleled chemical specificity and sensitivity. However, most IMS platforms are not able to achieve microscopy-level spatial resolution and lack cellular mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13120v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13120v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13120v1-abstract-full" style="display: none;"> Imaging mass spectrometry (IMS) is a powerful tool for untargeted, highly multiplexed molecular mapping of tissue in biomedical research. IMS offers a means of mapping the spatial distributions of molecular species in biological tissue with unparalleled chemical specificity and sensitivity. However, most IMS platforms are not able to achieve microscopy-level spatial resolution and lack cellular morphological contrast, necessitating subsequent histochemical staining, microscopic imaging and advanced image registration steps to enable molecular distributions to be linked to specific tissue features and cell types. Here, we present a virtual histological staining approach that enhances spatial resolution and digitally introduces cellular morphological contrast into mass spectrometry images of label-free human tissue using a diffusion model. Blind testing on human kidney tissue demonstrated that the virtually stained images of label-free samples closely match their histochemically stained counterparts (with Periodic Acid-Schiff staining), showing high concordance in identifying key renal pathology structures despite utilizing IMS data with 10-fold larger pixel size. Additionally, our approach employs an optimized noise sampling technique during the diffusion model's inference process to reduce variance in the generated images, yielding reliable and repeatable virtual staining. We believe this virtual staining method will significantly expand the applicability of IMS in life sciences and open new avenues for mass spectrometry-based biomedical research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13120v1-abstract-full').style.display = 'none'; document.getElementById('2411.13120v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 Pages, 6 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13053">arXiv:2411.13053</a> <span> [<a href="https://arxiv.org/pdf/2411.13053">pdf</a>, <a href="https://arxiv.org/format/2411.13053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MEGL: Multimodal Explanation-Guided Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+T">Tianxu Jiang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+B">Bo Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyu Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+G">Guangji Bai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13053v1-abstract-short" style="display: inline;"> Explaining the decision-making processes of Artificial Intelligence (AI) models is crucial for addressing their "black box" nature, particularly in tasks like image classification. Traditional eXplainable AI (XAI) methods typically rely on unimodal explanations, either visual or textual, each with inherent limitations. Visual explanations highlight key regions but often lack rationale, while textu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13053v1-abstract-full" style="display: none;"> Explaining the decision-making processes of Artificial Intelligence (AI) models is crucial for addressing their "black box" nature, particularly in tasks like image classification. Traditional eXplainable AI (XAI) methods typically rely on unimodal explanations, either visual or textual, each with inherent limitations. Visual explanations highlight key regions but often lack rationale, while textual explanations provide context without spatial grounding. Further, both explanation types can be inconsistent or incomplete, limiting their reliability. To address these challenges, we propose a novel Multimodal Explanation-Guided Learning (MEGL) framework that leverages both visual and textual explanations to enhance model interpretability and improve classification performance. Our Saliency-Driven Textual Grounding (SDTG) approach integrates spatial information from visual explanations into textual rationales, providing spatially grounded and contextually rich explanations. Additionally, we introduce Textual Supervision on Visual Explanations to align visual explanations with textual rationales, even in cases where ground truth visual annotations are missing. A Visual Explanation Distribution Consistency loss further reinforces visual coherence by aligning the generated visual explanations with dataset-level patterns, enabling the model to effectively learn from incomplete multimodal supervision. We validate MEGL on two new datasets, Object-ME and Action-ME, for image classification with multimodal explanations. Experimental results demonstrate that MEGL outperforms previous approaches in prediction accuracy and explanation quality across both visual and textual domains. Our code will be made available upon the acceptance of the paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13053v1-abstract-full').style.display = 'none'; document.getElementById('2411.13053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12967">arXiv:2411.12967</a> <span> [<a href="https://arxiv.org/pdf/2411.12967">pdf</a>, <a href="https://arxiv.org/format/2411.12967">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Shrinking POMCP: A Framework for Real-Time UAV Search and Rescue </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Baiting Luo</a>, <a href="/search/cs?searchtype=author&query=Mukhopadhyay%2C+A">Ayan Mukhopadhyay</a>, <a href="/search/cs?searchtype=author&query=Stojcsics%2C+D">Daniel Stojcsics</a>, <a href="/search/cs?searchtype=author&query=Elenius%2C+D">Daniel Elenius</a>, <a href="/search/cs?searchtype=author&query=Roy%2C+A">Anirban Roy</a>, <a href="/search/cs?searchtype=author&query=Jha%2C+S">Susmit Jha</a>, <a href="/search/cs?searchtype=author&query=Maroti%2C+M">Miklos Maroti</a>, <a href="/search/cs?searchtype=author&query=Koutsoukos%2C+X">Xenofon Koutsoukos</a>, <a href="/search/cs?searchtype=author&query=Karsai%2C+G">Gabor Karsai</a>, <a href="/search/cs?searchtype=author&query=Dubey%2C+A">Abhishek Dubey</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12967v1-abstract-short" style="display: inline;"> Efficient path optimization for drones in search and rescue operations faces challenges, including limited visibility, time constraints, and complex information gathering in urban environments. We present a comprehensive approach to optimize UAV-based search and rescue operations in neighborhood areas, utilizing both a 3D AirSim-ROS2 simulator and a 2D simulator. The path planning problem is formu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12967v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12967v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12967v1-abstract-full" style="display: none;"> Efficient path optimization for drones in search and rescue operations faces challenges, including limited visibility, time constraints, and complex information gathering in urban environments. We present a comprehensive approach to optimize UAV-based search and rescue operations in neighborhood areas, utilizing both a 3D AirSim-ROS2 simulator and a 2D simulator. The path planning problem is formulated as a partially observable Markov decision process (POMDP), and we propose a novel ``Shrinking POMCP'' approach to address time constraints. In the AirSim environment, we integrate our approach with a probabilistic world model for belief maintenance and a neurosymbolic navigator for obstacle avoidance. The 2D simulator employs surrogate ROS2 nodes with equivalent functionality. We compare trajectories generated by different approaches in the 2D simulator and evaluate performance across various belief types in the 3D AirSim-ROS simulator. Experimental results from both simulators demonstrate that our proposed shrinking POMCP solution achieves significant improvements in search times compared to alternative methods, showcasing its potential for enhancing the efficiency of UAV-assisted search and rescue operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12967v1-abstract-full').style.display = 'none'; document.getElementById('2411.12967v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to the The 3rd International Conference on Assured Autonomy</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12773">arXiv:2411.12773</a> <span> [<a href="https://arxiv.org/pdf/2411.12773">pdf</a>, <a href="https://arxiv.org/format/2411.12773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Decoupling Training-Free Guided Diffusion by ADMM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zehua Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+J+J">James J. Clark</a>, <a href="/search/cs?searchtype=author&query=Si%2C+X">Xujie Si</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12773v1-abstract-short" style="display: inline;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12773v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12773v1-abstract-full" style="display: none;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two components. Specifically, we introduce two variables ${x}$ and ${z}$, to represent the generated samples governed by the unconditional generation model and the guidance function, respectively. This decoupling reformulates conditional generation into two manageable subproblems, unified by the constraint ${x} = {z}$. Leveraging this setup, we develop a new algorithm based on the Alternating Direction Method of Multipliers (ADMM) to adaptively balance these components. Additionally, we establish the equivalence between the diffusion reverse step and the proximal operator of ADMM and provide a detailed convergence analysis of our algorithm under certain mild assumptions. Our experiments demonstrate that our proposed method ADMMDiff consistently generates high-quality samples while ensuring strong adherence to the conditioning criteria. It outperforms existing methods across a range of conditional generation tasks, including image generation with various guidance and controllable motion synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12773v1-abstract-full').style.display = 'none'; document.getElementById('2411.12773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12759">arXiv:2411.12759</a> <span> [<a href="https://arxiv.org/pdf/2411.12759">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Novel Approach to Eliminating Hallucinations in Large Language Model-Assisted Causal Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sng%2C+G">Grace Sng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanming Zhang</a>, <a href="/search/cs?searchtype=author&query=Mueller%2C+K">Klaus Mueller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12759v1-abstract-short" style="display: inline;"> The increasing use of large language models (LLMs) in causal discovery as a substitute for human domain experts highlights the need for optimal model selection. This paper presents the first hallucination survey of popular LLMs for causal discovery. We show that hallucinations exist when using LLMs in causal discovery so the choice of LLM is important. We propose using Retrieval Augmented Generati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12759v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12759v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12759v1-abstract-full" style="display: none;"> The increasing use of large language models (LLMs) in causal discovery as a substitute for human domain experts highlights the need for optimal model selection. This paper presents the first hallucination survey of popular LLMs for causal discovery. We show that hallucinations exist when using LLMs in causal discovery so the choice of LLM is important. We propose using Retrieval Augmented Generation (RAG) to reduce hallucinations when quality data is available. Additionally, we introduce a novel method employing multiple LLMs with an arbiter in a debate to audit edges in causal graphs, achieving a comparable reduction in hallucinations to RAG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12759v1-abstract-full').style.display = 'none'; document.getElementById('2411.12759v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12701">arXiv:2411.12701</a> <span> [<a href="https://arxiv.org/pdf/2411.12701">pdf</a>, <a href="https://arxiv.org/format/2411.12701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> When Backdoors Speak: Understanding LLM Backdoor Attacks Through Model-Generated Explanations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+H">Huaizhi Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruixiang Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12701v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are vulnerable to backdoor attacks, where hidden triggers can maliciously manipulate model behavior. While several backdoor attack methods have been proposed, the mechanisms by which backdoor functions operate in LLMs remain underexplored. In this paper, we move beyond attacking LLMs and investigate backdoor functionality through the novel lens of natural language expl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12701v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12701v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are vulnerable to backdoor attacks, where hidden triggers can maliciously manipulate model behavior. While several backdoor attack methods have been proposed, the mechanisms by which backdoor functions operate in LLMs remain underexplored. In this paper, we move beyond attacking LLMs and investigate backdoor functionality through the novel lens of natural language explanations. Specifically, we leverage LLMs' generative capabilities to produce human-understandable explanations for their decisions, allowing us to compare explanations for clean and poisoned samples. We explore various backdoor attacks and embed the backdoor into LLaMA models for multiple tasks. Our experiments show that backdoored models produce higher-quality explanations for clean data compared to poisoned data, while generating significantly more consistent explanations for poisoned data than for clean data. We further analyze the explanation generation process, revealing that at the token level, the explanation token of poisoned samples only appears in the final few transformer layers of the LLM. At the sentence level, attention dynamics indicate that poisoned inputs shift attention from the input context when generating the explanation. These findings deepen our understanding of backdoor attack mechanisms in LLMs and offer a framework for detecting such vulnerabilities through explainability techniques, contributing to the development of more secure LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12701v1-abstract-full').style.display = 'none'; document.getElementById('2411.12701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12689">arXiv:2411.12689</a> <span> [<a href="https://arxiv.org/pdf/2411.12689">pdf</a>, <a href="https://arxiv.org/format/2411.12689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> IMUVIE: Pickup Timeline Action Localization via Motion Movies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Clapham%2C+J">John Clapham</a>, <a href="/search/cs?searchtype=author&query=Koltermann%2C+K">Kenneth Koltermann</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanfu Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuming Sun</a>, <a href="/search/cs?searchtype=author&query=Burnet%2C+E+N">Evie N Burnet</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+G">Gang Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12689v1-abstract-short" style="display: inline;"> Falls among seniors due to difficulties with tasks such as picking up objects pose significant health and safety risks, impacting quality of life and independence. Reliable, accessible assessment tools are critical for early intervention but often require costly clinic-based equipment and trained personnel, limiting their use in daily life. Existing wearable-based pickup measurement solutions addr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12689v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12689v1-abstract-full" style="display: none;"> Falls among seniors due to difficulties with tasks such as picking up objects pose significant health and safety risks, impacting quality of life and independence. Reliable, accessible assessment tools are critical for early intervention but often require costly clinic-based equipment and trained personnel, limiting their use in daily life. Existing wearable-based pickup measurement solutions address some needs but face limitations in generalizability. We present IMUVIE, a wearable system that uses motion movies and a machine-learning model to automatically detect and measure pickup events, providing a practical solution for frequent monitoring. IMUVIE's design principles-data normalization, occlusion handling, and streamlined visuals-enhance model performance and are adaptable to tasks beyond pickup classification. In rigorous leave one subject out cross validation evaluations, IMUVIE achieves exceptional window level localization accuracy of 91-92% for pickup action classification on 256,291 motion movie frame candidates while maintaining an event level recall of 97% when evaluated on 129 pickup events. IMUVIE has strong generalization and performs well on unseen subjects. In an interview survey, IMUVIE demonstrated strong user interest and trust, with ease of use identified as the most critical factor for adoption. IMUVIE offers a practical, at-home solution for fall risk assessment, facilitating early detection of movement deterioration, and supporting safer, independent living for seniors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12689v1-abstract-full').style.display = 'none'; document.getElementById('2411.12689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is a preprint version, 12 pages, 20 figures, 3 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.5.4; I.5.5; I.5.2; I.5; I.2.10; I.2.1; I.2.9; I.4.9; J.3; J.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12641">arXiv:2411.12641</a> <span> [<a href="https://arxiv.org/pdf/2411.12641">pdf</a>, <a href="https://arxiv.org/format/2411.12641">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Improving Controllability and Editability for Pretrained Text-to-Music Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yixiao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12641v2-abstract-short" style="display: inline;"> The field of AI-assisted music creation has made significant strides, yet existing systems often struggle to meet the demands of iterative and nuanced music production. These challenges include providing sufficient control over the generated content and allowing for flexible, precise edits. This thesis tackles these issues by introducing a series of advancements that progressively build upon each… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12641v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12641v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12641v2-abstract-full" style="display: none;"> The field of AI-assisted music creation has made significant strides, yet existing systems often struggle to meet the demands of iterative and nuanced music production. These challenges include providing sufficient control over the generated content and allowing for flexible, precise edits. This thesis tackles these issues by introducing a series of advancements that progressively build upon each other, enhancing the controllability and editability of text-to-music generation models. First, we introduce Loop Copilot, a system that tries to address the need for iterative refinement in music creation. Loop Copilot leverages a large language model (LLM) to coordinate multiple specialised AI models, enabling users to generate and refine music interactively through a conversational interface. Central to this system is the Global Attribute Table, which records and maintains key musical attributes throughout the iterative process, ensuring that modifications at any stage preserve the overall coherence of the music. While Loop Copilot excels in orchestrating the music creation process, it does not directly address the need for detailed edits to the generated content. To overcome this limitation, MusicMagus is presented as a further solution for editing AI-generated music. MusicMagus introduces a zero-shot text-to-music editing approach that allows for the modification of specific musical attributes, such as genre, mood, and instrumentation, without the need for retraining. By manipulating the latent space within pre-trained diffusion models, MusicMagus ensures that these edits are stylistically coherent and that non-targeted attributes remain unchanged. This system is particularly effective in maintaining the structural integrity of the music during edits, but it encounters challenges with more complex and real-world audio scenarios. ... <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12641v2-abstract-full').style.display = 'none'; document.getElementById('2411.12641v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">PhD Thesis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12626">arXiv:2411.12626</a> <span> [<a href="https://arxiv.org/pdf/2411.12626">pdf</a>, <a href="https://arxiv.org/format/2411.12626">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Manifold of Neural Networks Using Diffusion Geometry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abel%2C+E">Elliott Abel</a>, <a href="/search/cs?searchtype=author&query=Crevasse%2C+P">Peyton Crevasse</a>, <a href="/search/cs?searchtype=author&query=Grinspan%2C+Y">Yvan Grinspan</a>, <a href="/search/cs?searchtype=author&query=Mazioud%2C+S">Selma Mazioud</a>, <a href="/search/cs?searchtype=author&query=Ogundipe%2C+F">Folu Ogundipe</a>, <a href="/search/cs?searchtype=author&query=Reimann%2C+K">Kristof Reimann</a>, <a href="/search/cs?searchtype=author&query=Schueler%2C+E">Ellie Schueler</a>, <a href="/search/cs?searchtype=author&query=Steindl%2C+A+J">Andrew J. Steindl</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+E">Ellen Zhang</a>, <a href="/search/cs?searchtype=author&query=Bhaskar%2C+D">Dhananjay Bhaskar</a>, <a href="/search/cs?searchtype=author&query=Viswanath%2C+S">Siddharth Viswanath</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanlei Zhang</a>, <a href="/search/cs?searchtype=author&query=Rudner%2C+T+G+J">Tim G. J. Rudner</a>, <a href="/search/cs?searchtype=author&query=Adelstein%2C+I">Ian Adelstein</a>, <a href="/search/cs?searchtype=author&query=Krishnaswamy%2C+S">Smita Krishnaswamy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12626v1-abstract-short" style="display: inline;"> Drawing motivation from the manifold hypothesis, which posits that most high-dimensional data lies on or near low-dimensional manifolds, we apply manifold learning to the space of neural networks. We learn manifolds where datapoints are neural networks by introducing a distance between the hidden layer representations of the neural networks. These distances are then fed to the non-linear dimension… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12626v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12626v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12626v1-abstract-full" style="display: none;"> Drawing motivation from the manifold hypothesis, which posits that most high-dimensional data lies on or near low-dimensional manifolds, we apply manifold learning to the space of neural networks. We learn manifolds where datapoints are neural networks by introducing a distance between the hidden layer representations of the neural networks. These distances are then fed to the non-linear dimensionality reduction algorithm PHATE to create a manifold of neural networks. We characterize this manifold using features of the representation, including class separation, hierarchical cluster structure, spectral entropy, and topological structure. Our analysis reveals that high-performing networks cluster together in the manifold, displaying consistent embedding patterns across all these features. Finally, we demonstrate the utility of this approach for guiding hyperparameter optimization and neural architecture search by sampling from the manifold. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12626v1-abstract-full').style.display = 'none'; document.getElementById('2411.12626v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12586">arXiv:2411.12586</a> <span> [<a href="https://arxiv.org/pdf/2411.12586">pdf</a>, <a href="https://arxiv.org/format/2411.12586">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Infrared-Assisted Single-Stage Framework for Joint Restoration and Fusion of Visible and Infrared Images under Hazy Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Huafeng Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jiaqi Fang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yafei Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12586v1-abstract-short" style="display: inline;"> Infrared and visible (IR-VIS) image fusion has gained significant attention for its broad application value. However, existing methods often neglect the complementary role of infrared image in restoring visible image features under hazy conditions. To address this, we propose a joint learning framework that utilizes infrared image for the restoration and fusion of hazy IR-VIS images. To mitigate t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12586v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12586v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12586v1-abstract-full" style="display: none;"> Infrared and visible (IR-VIS) image fusion has gained significant attention for its broad application value. However, existing methods often neglect the complementary role of infrared image in restoring visible image features under hazy conditions. To address this, we propose a joint learning framework that utilizes infrared image for the restoration and fusion of hazy IR-VIS images. To mitigate the adverse effects of feature diversity between IR-VIS images, we introduce a prompt generation mechanism that regulates modality-specific feature incompatibility. This creates a prompt selection matrix from non-shared image information, followed by prompt embeddings generated from a prompt pool. These embeddings help generate candidate features for dehazing. We further design an infrared-assisted feature restoration mechanism that selects candidate features based on haze density, enabling simultaneous restoration and fusion within a single-stage framework. To enhance fusion quality, we construct a multi-stage prompt embedding fusion module that leverages feature supplementation from the prompt generation module. Our method effectively fuses IR-VIS images while removing haze, yielding clear, haze-free fusion results. In contrast to two-stage methods that dehaze and then fuse, our approach enables collaborative training in a single-stage framework, making the model relatively lightweight and suitable for practical deployment. Experimental results validate its effectiveness and demonstrate advantages over existing methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12586v1-abstract-full').style.display = 'none'; document.getElementById('2411.12586v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12584">arXiv:2411.12584</a> <span> [<a href="https://arxiv.org/pdf/2411.12584">pdf</a>, <a href="https://arxiv.org/format/2411.12584">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Leveraging MLLM Embeddings and Attribute Smoothing for Compositional Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xudong Yan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Songhe Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yueguan Lin</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+H">Haojun Fei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12584v1-abstract-short" style="display: inline;"> Compositional zero-shot learning (CZSL) aims to recognize novel compositions of attributes and objects learned from seen compositions. Previous works disentangle attribute and object by extracting shared and exclusive parts between image pairs sharing the same attribute (object), as well as aligning them with pretrained word embeddings to improve unseen attribute-object recognition. Despite the si… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12584v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12584v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12584v1-abstract-full" style="display: none;"> Compositional zero-shot learning (CZSL) aims to recognize novel compositions of attributes and objects learned from seen compositions. Previous works disentangle attribute and object by extracting shared and exclusive parts between image pairs sharing the same attribute (object), as well as aligning them with pretrained word embeddings to improve unseen attribute-object recognition. Despite the significant achievements of existing efforts, they are hampered by three limitations: (1) the efficacy of disentanglement is compromised due to the influence of the background and the intricate entanglement of attribute with object in the same parts. (2) existing word embeddings fail to capture complex multimodal semantic information. (3) overconfidence exhibited by existing models in seen compositions hinders their generalization to novel compositions. Being aware of these, we propose a novel framework named Multimodal Large Language Model (MLLM) embeddings and attribute smoothing guided disentanglement (TRIDENT) for CZSL. First, we leverage feature adaptive aggregation modules to mitigate the impact of background, and utilize learnable condition masks to capture multigranularity features for disentanglement. Then, the last hidden states of MLLM are employed as word embeddings for their superior representation capabilities. Moreover, we propose attribute smoothing with auxiliary attributes generated by Large Language Model (LLM) for seen compositions, addressing the issue of overconfidence by encouraging the model to learn more attributes in one given composition. Extensive experiments demonstrate that TRIDENT achieves state-of-the-art performance on three benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12584v1-abstract-full').style.display = 'none'; document.getElementById('2411.12584v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12530">arXiv:2411.12530</a> <span> [<a href="https://arxiv.org/pdf/2411.12530">pdf</a>, <a href="https://arxiv.org/format/2411.12530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Contourlet Refinement Gate Framework for Thermal Spectrum Distribution Regularized Infrared Image Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yang Zou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhixin Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhipeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingyuan Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Long Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jinyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12530v1-abstract-short" style="display: inline;"> Image super-resolution (SR) is a classical yet still active low-level vision problem that aims to reconstruct high-resolution (HR) images from their low-resolution (LR) counterparts, serving as a key technique for image enhancement. Current approaches to address SR tasks, such as transformer-based and diffusion-based methods, are either dedicated to extracting RGB image features or assuming simila… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12530v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12530v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12530v1-abstract-full" style="display: none;"> Image super-resolution (SR) is a classical yet still active low-level vision problem that aims to reconstruct high-resolution (HR) images from their low-resolution (LR) counterparts, serving as a key technique for image enhancement. Current approaches to address SR tasks, such as transformer-based and diffusion-based methods, are either dedicated to extracting RGB image features or assuming similar degradation patterns, neglecting the inherent modal disparities between infrared and visible images. When directly applied to infrared image SR tasks, these methods inevitably distort the infrared spectral distribution, compromising the machine perception in downstream tasks. In this work, we emphasize the infrared spectral distribution fidelity and propose a Contourlet refinement gate framework to restore infrared modal-specific features while preserving spectral distribution fidelity. Our approach captures high-pass subbands from multi-scale and multi-directional infrared spectral decomposition to recover infrared-degraded information through a gate architecture. The proposed Spectral Fidelity Loss regularizes the spectral frequency distribution during reconstruction, which ensures the preservation of both high- and low-frequency components and maintains the fidelity of infrared-specific features. We propose a two-stage prompt-learning optimization to guide the model in learning infrared HR characteristics from LR degradation. Extensive experiments demonstrate that our approach outperforms existing image SR models in both visual and perceptual tasks while notably enhancing machine perception in downstream tasks. Our code is available at https://github.com/hey-it-s-me/CoRPLE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12530v1-abstract-full').style.display = 'none'; document.getElementById('2411.12530v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 figures, 6 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12426">arXiv:2411.12426</a> <span> [<a href="https://arxiv.org/pdf/2411.12426">pdf</a>, <a href="https://arxiv.org/format/2411.12426">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Motif Channel Opened in a White-Box: Stereo Matching via Motif Correlation Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongjun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenting Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingshu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yong Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C+L+P">C. L. Philip Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12426v1-abstract-short" style="display: inline;"> Real-world applications of stereo matching, such as autonomous driving, place stringent demands on both safety and accuracy. However, learning-based stereo matching methods inherently suffer from the loss of geometric structures in certain feature channels, creating a bottleneck in achieving precise detail matching. Additionally, these methods lack interpretability due to the black-box nature of d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12426v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12426v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12426v1-abstract-full" style="display: none;"> Real-world applications of stereo matching, such as autonomous driving, place stringent demands on both safety and accuracy. However, learning-based stereo matching methods inherently suffer from the loss of geometric structures in certain feature channels, creating a bottleneck in achieving precise detail matching. Additionally, these methods lack interpretability due to the black-box nature of deep learning. In this paper, we propose MoCha-V2, a novel learning-based paradigm for stereo matching. MoCha-V2 introduces the Motif Correlation Graph (MCG) to capture recurring textures, which are referred to as ``motifs" within feature channels. These motifs reconstruct geometric structures and are learned in a more interpretable way. Subsequently, we integrate features from multiple frequency domains through wavelet inverse transformation. The resulting motif features are utilized to restore geometric structures in the stereo matching process. Experimental results demonstrate the effectiveness of MoCha-V2. MoCha-V2 achieved 1st place on the Middlebury benchmark at the time of its release. Code is available at https://github.com/ZYangChen/MoCha-Stereo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12426v1-abstract-full').style.display = 'none'; document.getElementById('2411.12426v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12175">arXiv:2411.12175</a> <span> [<a href="https://arxiv.org/pdf/2411.12175">pdf</a>, <a href="https://arxiv.org/format/2411.12175">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AsynEIO: Asynchronous Monocular Event-Inertial Odometry Using Gaussian Process Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhixiang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xudong Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yizhai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Panfeng"> Panfeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12175v1-abstract-short" style="display: inline;"> Event cameras, when combined with inertial sensors, show significant potential for motion estimation in challenging scenarios, such as high-speed maneuvers and low-light environments. There are many methods for producing such estimations, but most boil down to a synchronous discrete-time fusion problem. However, the asynchronous nature of event cameras and their unique fusion mechanism with inerti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12175v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12175v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12175v1-abstract-full" style="display: none;"> Event cameras, when combined with inertial sensors, show significant potential for motion estimation in challenging scenarios, such as high-speed maneuvers and low-light environments. There are many methods for producing such estimations, but most boil down to a synchronous discrete-time fusion problem. However, the asynchronous nature of event cameras and their unique fusion mechanism with inertial sensors remain underexplored. In this paper, we introduce a monocular event-inertial odometry method called AsynEIO, designed to fuse asynchronous event and inertial data within a unified Gaussian Process (GP) regression framework. Our approach incorporates an event-driven frontend that tracks feature trajectories directly from raw event streams at a high temporal resolution. These tracked feature trajectories, along with various inertial factors, are integrated into the same GP regression framework to enable asynchronous fusion. With deriving analytical residual Jacobians and noise models, our method constructs a factor graph that is iteratively optimized and pruned using a sliding-window optimizer. Comparative assessments highlight the performance of different inertial fusion strategies, suggesting optimal choices for varying conditions. Experimental results on both public datasets and our own event-inertial sequences indicate that AsynEIO outperforms existing methods, especially in high-speed and low-illumination scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12175v1-abstract-full').style.display = 'none'; document.getElementById('2411.12175v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE (2024-11-4)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11799">arXiv:2411.11799</a> <span> [<a href="https://arxiv.org/pdf/2411.11799">pdf</a>, <a href="https://arxiv.org/format/2411.11799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Edge-Enhanced Dilated Residual Attention Network for Multimodal Medical Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Meng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaolan Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayi Wang</a>, <a href="/search/cs?searchtype=author&query=Khalvati%2C+F">Farzad Khalvati</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11799v1-abstract-short" style="display: inline;"> Multimodal medical image fusion is a crucial task that combines complementary information from different imaging modalities into a unified representation, thereby enhancing diagnostic accuracy and treatment planning. While deep learning methods, particularly Convolutional Neural Networks (CNNs) and Transformers, have significantly advanced fusion performance, some of the existing CNN-based methods… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11799v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11799v1-abstract-full" style="display: none;"> Multimodal medical image fusion is a crucial task that combines complementary information from different imaging modalities into a unified representation, thereby enhancing diagnostic accuracy and treatment planning. While deep learning methods, particularly Convolutional Neural Networks (CNNs) and Transformers, have significantly advanced fusion performance, some of the existing CNN-based methods fall short in capturing fine-grained multiscale and edge features, leading to suboptimal feature integration. Transformer-based models, on the other hand, are computationally intensive in both the training and fusion stages, making them impractical for real-time clinical use. Moreover, the clinical application of fused images remains unexplored. In this paper, we propose a novel CNN-based architecture that addresses these limitations by introducing a Dilated Residual Attention Network Module for effective multiscale feature extraction, coupled with a gradient operator to enhance edge detail learning. To ensure fast and efficient fusion, we present a parameter-free fusion strategy based on the weighted nuclear norm of softmax, which requires no additional computations during training or inference. Extensive experiments, including a downstream brain tumor classification task, demonstrate that our approach outperforms various baseline methods in terms of visual quality, texture preservation, and fusion speed, making it a possible practical solution for real-world clinical applications. The code will be released at https://github.com/simonZhou86/en_dran. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11799v1-abstract-full').style.display = 'none'; document.getElementById('2411.11799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An extended version of the paper accepted at IEEE BIBM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11683">arXiv:2411.11683</a> <span> [<a href="https://arxiv.org/pdf/2411.11683">pdf</a>, <a href="https://arxiv.org/format/2411.11683">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the Physical World </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xianlong Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+H">Hewen Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hangtao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minghui Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengshan Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Lulu Xue</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+P">Peijin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yichen Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+W">Wei Wan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aishan Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L+Y">Leo Yu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11683v1-abstract-short" style="display: inline;"> Robotic manipulation refers to the autonomous handling and interaction of robots with objects using advanced techniques in robotics and artificial intelligence. The advent of powerful tools such as large language models (LLMs) and large vision-language models (LVLMs) has significantly enhanced the capabilities of these robots in environmental perception and decision-making. However, the introducti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11683v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11683v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11683v1-abstract-full" style="display: none;"> Robotic manipulation refers to the autonomous handling and interaction of robots with objects using advanced techniques in robotics and artificial intelligence. The advent of powerful tools such as large language models (LLMs) and large vision-language models (LVLMs) has significantly enhanced the capabilities of these robots in environmental perception and decision-making. However, the introduction of these intelligent agents has led to security threats such as jailbreak attacks and adversarial attacks. In this research, we take a further step by proposing a backdoor attack specifically targeting robotic manipulation and, for the first time, implementing backdoor attack in the physical world. By embedding a backdoor visual language model into the visual perception module within the robotic system, we successfully mislead the robotic arm's operation in the physical world, given the presence of common items as triggers. Experimental evaluations in the physical world demonstrate the effectiveness of the proposed backdoor attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11683v1-abstract-full').style.display = 'none'; document.getElementById('2411.11683v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Initial version with preliminary results. We welcome any feedback or suggestions</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11627">arXiv:2411.11627</a> <span> [<a href="https://arxiv.org/pdf/2411.11627">pdf</a>, <a href="https://arxiv.org/ps/2411.11627">ps</a>, <a href="https://arxiv.org/format/2411.11627">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Explicit Two-Sided Vertex Expanders Beyond the Spectral Barrier </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hsieh%2C+J">Jun-Ting Hsieh</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Ting-Chun Lin</a>, <a href="/search/cs?searchtype=author&query=Mohanty%2C+S">Sidhanth Mohanty</a>, <a href="/search/cs?searchtype=author&query=O%27Donnell%2C+R">Ryan O'Donnell</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R+Y">Rachel Yun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11627v1-abstract-short" style="display: inline;"> We construct the first explicit two-sided vertex expanders that bypass the spectral barrier. Previously, the strongest known explicit vertex expanders were given by $d$-regular Ramanujan graphs, whose spectral properties imply that every small subset of vertices $S$ has at least $0.5d|S|$ distinct neighbors. However, it is possible to construct Ramanujan graphs containing a small set $S$ with no… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11627v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11627v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11627v1-abstract-full" style="display: none;"> We construct the first explicit two-sided vertex expanders that bypass the spectral barrier. Previously, the strongest known explicit vertex expanders were given by $d$-regular Ramanujan graphs, whose spectral properties imply that every small subset of vertices $S$ has at least $0.5d|S|$ distinct neighbors. However, it is possible to construct Ramanujan graphs containing a small set $S$ with no more than $0.5d|S|$ neighbors. In fact, no explicit construction was known to break the $0.5 d$-barrier. In this work, we give an explicit construction of an infinite family of $d$-regular graphs (for large enough $d$) where every small set expands by a factor of $\approx 0.6d$. More generally, for large enough $d_1,d_2$, we give an infinite family of $(d_1,d_2)$-biregular graphs where small sets on the left expand by a factor of $\approx 0.6d_1$, and small sets on the right expand by a factor of $\approx 0.6d_2$. In fact, our construction satisfies an even stronger property: small sets on the left and right have unique-neighbor expansion $0.6d_1$ and $0.6d_2$ respectively. Our construction follows the tripartite line product framework of Hsieh, McKenzie, Mohanty & Paredes, and instantiates it using the face-vertex incidence of the $4$-dimensional Ramanujan clique complex as its base component. As a key part of our analysis, we derive new bounds on the triangle density of small sets in the Ramanujan clique complex. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11627v1-abstract-full').style.display = 'none'; document.getElementById('2411.11627v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11487">arXiv:2411.11487</a> <span> [<a href="https://arxiv.org/pdf/2411.11487">pdf</a>, <a href="https://arxiv.org/format/2411.11487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Look a Group at Once: Multi-Slide Modeling for Survival Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yi Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianfei Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haixian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11487v1-abstract-short" style="display: inline;"> Survival prediction is a critical task in pathology. In clinical practice, pathologists often examine multiple cases, leveraging a broader spectrum of cancer phenotypes to enhance pathological assessment. Despite significant advancements in deep learning, current solutions typically model each slide as a sample, struggling to effectively capture comparable and slide-agnostic pathological features.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11487v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11487v1-abstract-full" style="display: none;"> Survival prediction is a critical task in pathology. In clinical practice, pathologists often examine multiple cases, leveraging a broader spectrum of cancer phenotypes to enhance pathological assessment. Despite significant advancements in deep learning, current solutions typically model each slide as a sample, struggling to effectively capture comparable and slide-agnostic pathological features. In this paper, we introduce GroupMIL, a novel framework inspired by the clinical practice of collective analysis, which models multiple slides as a single sample and organizes groups of patches and slides sequentially to capture cross-slide prognostic features. We also present GPAMamba, a model designed to facilitate intra- and inter-slide feature interactions, effectively capturing local micro-environmental characteristics within slide-level graphs while uncovering essential prognostic patterns across an extended patch sequence within the group framework. Furthermore, we develop a dual-head predictor that delivers comprehensive survival risk and probability assessments for each patient. Extensive empirical evaluations demonstrate that our model significantly outperforms state-of-the-art approaches across five datasets from The Cancer Genome Atlas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11487v1-abstract-full').style.display = 'none'; document.getElementById('2411.11487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11396">arXiv:2411.11396</a> <span> [<a href="https://arxiv.org/pdf/2411.11396">pdf</a>, <a href="https://arxiv.org/format/2411.11396">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face Forgery Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jikang Cheng</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Z">Zhiyuan Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+L">Li Hao</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+J">Jiaxin Ai</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Q">Qin Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11396v2-abstract-short" style="display: inline;"> The rapid advancement of face forgery techniques has introduced a growing variety of forgeries. Incremental Face Forgery Detection (IFFD), involving gradually adding new forgery data to fine-tune the previously trained model, has been introduced as a promising strategy to deal with evolving forgery methods. However, a naively trained IFFD model is prone to catastrophic forgetting when new forgerie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11396v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11396v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11396v2-abstract-full" style="display: none;"> The rapid advancement of face forgery techniques has introduced a growing variety of forgeries. Incremental Face Forgery Detection (IFFD), involving gradually adding new forgery data to fine-tune the previously trained model, has been introduced as a promising strategy to deal with evolving forgery methods. However, a naively trained IFFD model is prone to catastrophic forgetting when new forgeries are integrated, as treating all forgeries as a single ''Fake" class in the Real/Fake classification can cause different forgery types overriding one another, thereby resulting in the forgetting of unique characteristics from earlier tasks and limiting the model's effectiveness in learning forgery specificity and generality. In this paper, we propose to stack the latent feature distributions of previous and new tasks brick by brick, $\textit{i.e.}$, achieving $\textbf{aligned feature isolation}$. In this manner, we aim to preserve learned forgery information and accumulate new knowledge by minimizing distribution overriding, thereby mitigating catastrophic forgetting. To achieve this, we first introduce Sparse Uniform Replay (SUR) to obtain the representative subsets that could be treated as the uniformly sparse versions of the previous global distributions. We then propose a Latent-space Incremental Detector (LID) that leverages SUR data to isolate and align distributions. For evaluation, we construct a more advanced and comprehensive benchmark tailored for IFFD. The leading experimental results validate the superiority of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11396v2-abstract-full').style.display = 'none'; document.getElementById('2411.11396v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11364">arXiv:2411.11364</a> <span> [<a href="https://arxiv.org/pdf/2411.11364">pdf</a>, <a href="https://arxiv.org/format/2411.11364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Continual Task Learning through Adaptive Policy Self-Composition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengchao Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuhang Zhou</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Ziqing Fan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jifeng Hu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ya Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11364v1-abstract-short" style="display: inline;"> Training a generalizable agent to continually learn a sequence of tasks from offline trajectories is a natural requirement for long-lived agents, yet remains a significant challenge for current offline reinforcement learning (RL) algorithms. Specifically, an agent must be able to rapidly adapt to new tasks using newly collected trajectories (plasticity), while retaining knowledge from previously l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11364v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11364v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11364v1-abstract-full" style="display: none;"> Training a generalizable agent to continually learn a sequence of tasks from offline trajectories is a natural requirement for long-lived agents, yet remains a significant challenge for current offline reinforcement learning (RL) algorithms. Specifically, an agent must be able to rapidly adapt to new tasks using newly collected trajectories (plasticity), while retaining knowledge from previously learned tasks (stability). However, systematic analyses of this setting are scarce, and it remains unclear whether conventional continual learning (CL) methods are effective in continual offline RL (CORL) scenarios. In this study, we develop the Offline Continual World benchmark and demonstrate that traditional CL methods struggle with catastrophic forgetting, primarily due to the unique distribution shifts inherent to CORL scenarios. To address this challenge, we introduce CompoFormer, a structure-based continual transformer model that adaptively composes previous policies via a meta-policy network. Upon encountering a new task, CompoFormer leverages semantic correlations to selectively integrate relevant prior policies alongside newly trained parameters, thereby enhancing knowledge sharing and accelerating the learning process. Our experiments reveal that CompoFormer outperforms conventional CL methods, particularly in longer task sequences, showcasing a promising balance between plasticity and stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11364v1-abstract-full').style.display = 'none'; document.getElementById('2411.11364v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11357">arXiv:2411.11357</a> <span> [<a href="https://arxiv.org/pdf/2411.11357">pdf</a>, <a href="https://arxiv.org/format/2411.11357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Text-guided Zero-Shot Object Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingjing Wang</a>, <a href="/search/cs?searchtype=author&query=Piao%2C+X">Xinglin Piao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zongzhi Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Baocai Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11357v1-abstract-short" style="display: inline;"> Object localization is a hot issue in computer vision area, which aims to identify and determine the precise location of specific objects from image or video. Most existing object localization methods heavily rely on extensive labeled data, which are costly to annotate and constrain their applicability. Therefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for addressing the a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11357v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11357v1-abstract-full" style="display: none;"> Object localization is a hot issue in computer vision area, which aims to identify and determine the precise location of specific objects from image or video. Most existing object localization methods heavily rely on extensive labeled data, which are costly to annotate and constrain their applicability. Therefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for addressing the aforementioned challenges. In the proposed framework, we introduce the Contrastive Language Image Pre-training (CLIP) module which could integrate visual and linguistic information effectively. Furthermore, we design a Text Self-Similarity Matching (TSSM) module, which could improve the localization accuracy by enhancing the representation of text features extracted by CLIP module. Hence, the proposed framework can be guided by prompt words to identify and locate specific objects in an image in the absence of labeled samples. The results of extensive experiments demonstrate that the proposed method could improve the localization performance significantly and establishes an effective benchmark for further research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11357v1-abstract-full').style.display = 'none'; document.getElementById('2411.11357v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10883">arXiv:2411.10883</a> <span> [<a href="https://arxiv.org/pdf/2411.10883">pdf</a>, <a href="https://arxiv.org/format/2411.10883">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> I Know What You Sync: Covert and Side Channel Attacks on File Systems via syncfs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+C">Cheng Gu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yicheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Abu-Ghazaleh%2C+N">Nael Abu-Ghazaleh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10883v1-abstract-short" style="display: inline;"> Operating Systems enforce logical isolation using abstractions such as processes, containers, and isolation technologies to protect a system from malicious or buggy code. In this paper, we show new types of side channels through the file system that break this logical isolation. The file system plays a critical role in the operating system, managing all I/O activities between the application layer… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10883v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10883v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10883v1-abstract-full" style="display: none;"> Operating Systems enforce logical isolation using abstractions such as processes, containers, and isolation technologies to protect a system from malicious or buggy code. In this paper, we show new types of side channels through the file system that break this logical isolation. The file system plays a critical role in the operating system, managing all I/O activities between the application layer and the physical storage device. We observe that the file system implementation is shared, leading to timing leakage when using common I/O system calls. Specifically, we found that modern operating systems take advantage of any flush operation (which saves cached blocks in memory to the SSD or disk) to flush all of the I/O buffers, even those used by other isolation domains. Thus, by measuring the delay of syncfs, the attacker can infer the I/O behavior of victim programs. We then demonstrate a syncfs covert channel attack on multiple file systems, including both Linux native file systems and the Windows file system, achieving a maximum bandwidth of 5 Kbps with an error rate of 0.15% on Linux and 7.6 Kbps with an error rate of 1.9% on Windows. In addition, we construct three side-channel attacks targeting both Linux and Android devices. On Linux devices, we implement a website fingerprinting attack and a video fingerprinting attack by tracking the write patterns of temporary buffering files. On Android devices, we design an application fingerprinting attack that leaks application write patterns during boot-up. The attacks achieve over 90% F1 score, precision, and recall. Finally, we demonstrate that these attacks can be exploited across containers implementing a container detection technique and a cross-container covert channel attack. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10883v1-abstract-full').style.display = 'none'; document.getElementById('2411.10883v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10742">arXiv:2411.10742</a> <span> [<a href="https://arxiv.org/pdf/2411.10742">pdf</a>, <a href="https://arxiv.org/format/2411.10742">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> It Takes Two: Accurate Gait Recognition in the Wild via Cross-granularity Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jinkai Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinchen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+C">Chenggang Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiyong Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongdong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10742v1-abstract-short" style="display: inline;"> Existing studies for gait recognition primarily utilized sequences of either binary silhouette or human parsing to encode the shapes and dynamics of persons during walking. Silhouettes exhibit accurate segmentation quality and robustness to environmental variations, but their low information entropy may result in sub-optimal performance. In contrast, human parsing provides fine-grained part segmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10742v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10742v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10742v1-abstract-full" style="display: none;"> Existing studies for gait recognition primarily utilized sequences of either binary silhouette or human parsing to encode the shapes and dynamics of persons during walking. Silhouettes exhibit accurate segmentation quality and robustness to environmental variations, but their low information entropy may result in sub-optimal performance. In contrast, human parsing provides fine-grained part segmentation with higher information entropy, but the segmentation quality may deteriorate due to the complex environments. To discover the advantages of silhouette and parsing and overcome their limitations, this paper proposes a novel cross-granularity alignment gait recognition method, named XGait, to unleash the power of gait representations of different granularity. To achieve this goal, the XGait first contains two branches of backbone encoders to map the silhouette sequences and the parsing sequences into two latent spaces, respectively. Moreover, to explore the complementary knowledge across the features of two representations, we design the Global Cross-granularity Module (GCM) and the Part Cross-granularity Module (PCM) after the two encoders. In particular, the GCM aims to enhance the quality of parsing features by leveraging global features from silhouettes, while the PCM aligns the dynamics of human parts between silhouette and parsing features using the high information entropy in parsing sequences. In addition, to effectively guide the alignment of two representations with different granularity at the part level, an elaborate-designed learnable division mechanism is proposed for the parsing features. Comprehensive experiments on two large-scale gait datasets not only show the superior performance of XGait with the Rank-1 accuracy of 80.5% on Gait3D and 88.3% CCPG but also reflect the robustness of the learned features even under challenging conditions like occlusions and cloth changes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10742v1-abstract-full').style.display = 'none'; document.getElementById('2411.10742v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 9 figures; Accepted by ACM MM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10729">arXiv:2411.10729</a> <span> [<a href="https://arxiv.org/pdf/2411.10729">pdf</a>, <a href="https://arxiv.org/format/2411.10729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> On-device Anomaly Detection in Conveyor Belt Operations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Martinez-Rau%2C+L+S">Luciano S. Martinez-Rau</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Oelmann%2C+B">Bengt Oelmann</a>, <a href="/search/cs?searchtype=author&query=Bader%2C+S">Sebastian Bader</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10729v1-abstract-short" style="display: inline;"> Mining 4.0 leverages advancements in automation, digitalization, and interconnected technologies from Industry 4.0 to address the unique challenges of the mining sector, enhancing efficiency, safety, and sustainability. Conveyor belts are crucial in mining operations by enabling the continuous and efficient movement of bulk materials over long distances, which directly impacts productivity. While… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10729v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10729v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10729v1-abstract-full" style="display: none;"> Mining 4.0 leverages advancements in automation, digitalization, and interconnected technologies from Industry 4.0 to address the unique challenges of the mining sector, enhancing efficiency, safety, and sustainability. Conveyor belts are crucial in mining operations by enabling the continuous and efficient movement of bulk materials over long distances, which directly impacts productivity. While detecting anomalies in specific conveyor belt components, such as idlers, pulleys, and belt surfaces, has been widely studied, identifying the root causes of these failures remains critical due to factors like changing production conditions and operator errors. Continuous monitoring of mining conveyor belt work cycles for anomaly detection is still at an early stage and requires robust solutions. This study proposes two distinctive pattern recognition approaches for real-time anomaly detection in the operational cycles of mining conveyor belts, combining feature extraction, threshold-based cycle detection, and tiny machine-learning classification. Both approaches outperformed a state-of-the-art technique on two datasets for duty cycle classification in terms of F1-scores. The first approach, with 97.3% and 80.2% for normal and abnormal cycles, respectively, reaches the highest performance in the first dataset while the second approach excels on the second dataset, scoring 91.3% and 67.9%. Implemented on two low-power microcontrollers, the methods demonstrated efficient, real-time operation with energy consumption of 13.3 and 20.6 $渭$J during inference. These results offer valuable insights for detecting mechanical failure sources, supporting targeted preventive maintenance, and optimizing production cycles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10729v1-abstract-full').style.display = 'none'; document.getElementById('2411.10729v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint submitted to IEEE Transactions on Instrumentation and Measurement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10683">arXiv:2411.10683</a> <span> [<a href="https://arxiv.org/pdf/2411.10683">pdf</a>, <a href="https://arxiv.org/format/2411.10683">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> I'm Spartacus, No, I'm Spartacus: Measuring and Understanding LLM Identity Confusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+S">Shichao Zhuang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minghui Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruoxi Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xinwen Fu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xiuzhen Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10683v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) excel in diverse tasks such as text generation, data analysis, and software development, making them indispensable across domains like education, business, and creative industries. However, the rapid proliferation of LLMs (with over 560 companies developing or deploying them as of 2024) has raised concerns about their originality and trustworthiness. A notable issue, t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10683v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10683v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10683v1-abstract-full" style="display: none;"> Large Language Models (LLMs) excel in diverse tasks such as text generation, data analysis, and software development, making them indispensable across domains like education, business, and creative industries. However, the rapid proliferation of LLMs (with over 560 companies developing or deploying them as of 2024) has raised concerns about their originality and trustworthiness. A notable issue, termed identity confusion, has emerged, where LLMs misrepresent their origins or identities. This study systematically examines identity confusion through three research questions: (1) How prevalent is identity confusion among LLMs? (2) Does it arise from model reuse, plagiarism, or hallucination? (3) What are the security and trust-related impacts of identity confusion? To address these, we developed an automated tool combining documentation analysis, self-identity recognition testing, and output similarity comparisons--established methods for LLM fingerprinting--and conducted a structured survey via Credamo to assess its impact on user trust. Our analysis of 27 LLMs revealed that 25.93% exhibit identity confusion. Output similarity analysis confirmed that these issues stem from hallucinations rather than replication or reuse. Survey results further highlighted that identity confusion significantly erodes trust, particularly in critical tasks like education and professional use, with declines exceeding those caused by logical errors or inconsistencies. Users attributed these failures to design flaws, incorrect training data, and perceived plagiarism, underscoring the systemic risks posed by identity confusion to LLM reliability and trustworthiness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10683v1-abstract-full').style.display = 'none'; document.getElementById('2411.10683v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figure, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10665">arXiv:2411.10665</a> <span> [<a href="https://arxiv.org/pdf/2411.10665">pdf</a>, <a href="https://arxiv.org/format/2411.10665">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> AutoIoT: Automated IoT Platform Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Ye Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minghui Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruoxi Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10665v1-abstract-short" style="display: inline;"> IoT platforms, particularly smart home platforms providing significant convenience to people's lives such as Apple HomeKit and Samsung SmartThings, allow users to create automation rules through trigger-action programming. However, some users may lack the necessary knowledge to formulate automation rules, thus preventing them from fully benefiting from the conveniences offered by smart home techno… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10665v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10665v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10665v1-abstract-full" style="display: none;"> IoT platforms, particularly smart home platforms providing significant convenience to people's lives such as Apple HomeKit and Samsung SmartThings, allow users to create automation rules through trigger-action programming. However, some users may lack the necessary knowledge to formulate automation rules, thus preventing them from fully benefiting from the conveniences offered by smart home technology. To address this, smart home platforms provide pre-defined automation policies based on the smart home devices registered by the user. Nevertheless, these policies, being pre-generated and relatively simple, fail to adequately cover the diverse needs of users. Furthermore, conflicts may arise between automation rules, and integrating conflict detection into the IoT platform increases the burden on developers. In this paper, we propose AutoIoT, an automated IoT platform based on Large Language Models (LLMs) and formal verification techniques, designed to achieve end-to-end automation through device information extraction, LLM-based rule generation, conflict detection, and avoidance. AutoIoT can help users generate conflict-free automation rules and assist developers in generating codes for conflict detection, thereby enhancing their experience. A code adapter has been designed to separate logical reasoning from the syntactic details of code generation, enabling LLMs to generate code for programming languages beyond their training data. Finally, we evaluated the performance of AutoIoT and presented a case study demonstrating how AutoIoT can integrate with existing IoT platforms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10665v1-abstract-full').style.display = 'none'; document.getElementById('2411.10665v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10649">arXiv:2411.10649</a> <span> [<a href="https://arxiv.org/pdf/2411.10649">pdf</a>, <a href="https://arxiv.org/format/2411.10649">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep Loss Convexification for Learning Iterative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziming Zhang</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Y">Yuping Shao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fangzhou Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haichong Zhang</a>, <a href="/search/cs?searchtype=author&query=Rundensteiner%2C+E">Elke Rundensteiner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10649v1-abstract-short" style="display: inline;"> Iterative methods such as iterative closest point (ICP) for point cloud registration often suffer from bad local optimality (e.g. saddle points), due to the nature of nonconvex optimization. To address this fundamental challenge, in this paper we propose learning to form the loss landscape of a deep iterative method w.r.t. predictions at test time into a convex-like shape locally around each groun… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10649v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10649v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10649v1-abstract-full" style="display: none;"> Iterative methods such as iterative closest point (ICP) for point cloud registration often suffer from bad local optimality (e.g. saddle points), due to the nature of nonconvex optimization. To address this fundamental challenge, in this paper we propose learning to form the loss landscape of a deep iterative method w.r.t. predictions at test time into a convex-like shape locally around each ground truth given data, namely Deep Loss Convexification (DLC), thanks to the overparametrization in neural networks. To this end, we formulate our learning objective based on adversarial training by manipulating the ground-truth predictions, rather than input data. In particular, we propose using star-convexity, a family of structured nonconvex functions that are unimodal on all lines that pass through a global minimizer, as our geometric constraint for reshaping loss landscapes, leading to (1) extra novel hinge losses appended to the original loss and (2) near-optimal predictions. We demonstrate the state-of-the-art performance using DLC with existing network architectures for the tasks of training recurrent neural networks (RNNs), 3D point cloud registration, and multimodel image alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10649v1-abstract-full').style.display = 'none'; document.getElementById('2411.10649v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures, accepted paper to Transactions on Pattern Analysis and Machine Intelligence. arXiv admin note: text overlap with arXiv:2303.11526</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10606">arXiv:2411.10606</a> <span> [<a href="https://arxiv.org/pdf/2411.10606">pdf</a>, <a href="https://arxiv.org/format/2411.10606">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AmoebaLLM: Constructing Any-Shape Large Language Models for Efficient and Instant Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yonggan Fu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhongzhi Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junwei Li</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jiayi Qian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiangchi Yuan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+D">Dachuan Shi</a>, <a href="/search/cs?searchtype=author&query=Yakunin%2C+R">Roman Yakunin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y+C">Yingyan Celine Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10606v1-abstract-short" style="display: inline;"> Motivated by the transformative capabilities of large language models (LLMs) across various natural language tasks, there has been a growing demand to deploy these models effectively across diverse real-world applications and platforms. However, the challenge of efficiently deploying LLMs has become increasingly pronounced due to the varying application-specific performance requirements and the ra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10606v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10606v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10606v1-abstract-full" style="display: none;"> Motivated by the transformative capabilities of large language models (LLMs) across various natural language tasks, there has been a growing demand to deploy these models effectively across diverse real-world applications and platforms. However, the challenge of efficiently deploying LLMs has become increasingly pronounced due to the varying application-specific performance requirements and the rapid evolution of computational platforms, which feature diverse resource constraints and deployment flows. These varying requirements necessitate LLMs that can adapt their structures (depth and width) for optimal efficiency across different platforms and application specifications. To address this critical gap, we propose AmoebaLLM, a novel framework designed to enable the instant derivation of LLM subnets of arbitrary shapes, which achieve the accuracy-efficiency frontier and can be extracted immediately after a one-time fine-tuning. In this way, AmoebaLLM significantly facilitates rapid deployment tailored to various platforms and applications. Specifically, AmoebaLLM integrates three innovative components: (1) a knowledge-preserving subnet selection strategy that features a dynamic-programming approach for depth shrinking and an importance-driven method for width shrinking; (2) a shape-aware mixture of LoRAs to mitigate gradient conflicts among subnets during fine-tuning; and (3) an in-place distillation scheme with loss-magnitude balancing as the fine-tuning objective. Extensive experiments validate that AmoebaLLM not only sets new standards in LLM adaptability but also successfully delivers subnets that achieve state-of-the-art trade-offs between accuracy and efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10606v1-abstract-full').style.display = 'none'; document.getElementById('2411.10606v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10570">arXiv:2411.10570</a> <span> [<a href="https://arxiv.org/pdf/2411.10570">pdf</a>, <a href="https://arxiv.org/format/2411.10570">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Normative Modeling for AD Diagnosis and Biomarker Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Songlin Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Rong Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yong Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lifang He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10570v1-abstract-short" style="display: inline;"> In this paper, we introduce a novel normative modeling approach that incorporates focal loss and adversarial autoencoders (FAAE) for Alzheimer's Disease (AD) diagnosis and biomarker identification. Our method is an end-to-end approach that embeds an adversarial focal loss discriminator within the autoencoder structure, specifically designed to effectively target and capture more complex and challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10570v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10570v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10570v1-abstract-full" style="display: none;"> In this paper, we introduce a novel normative modeling approach that incorporates focal loss and adversarial autoencoders (FAAE) for Alzheimer's Disease (AD) diagnosis and biomarker identification. Our method is an end-to-end approach that embeds an adversarial focal loss discriminator within the autoencoder structure, specifically designed to effectively target and capture more complex and challenging cases. We first use the enhanced autoencoder to create a normative model based on data from healthy control (HC) individuals. We then apply this model to estimate total and regional neuroanatomical deviation in AD patients. Through extensive experiments on the OASIS-3 and ADNI datasets, our approach significantly outperforms previous state-of-the-art methods. This advancement not only streamlines the detection process but also provides a greater insight into the biomarker potential for AD. Our code can be found at \url{https://github.com/soz223/FAAE}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10570v1-abstract-full').style.display = 'none'; document.getElementById('2411.10570v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10548">arXiv:2411.10548</a> <span> [<a href="https://arxiv.org/pdf/2411.10548">pdf</a>, <a href="https://arxiv.org/ps/2411.10548">ps</a>, <a href="https://arxiv.org/format/2411.10548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> BioNeMo Framework: a modular, high-performance library for AI model development in drug discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=John%2C+P+S">Peter St. John</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+D">Dejun Lin</a>, <a href="/search/cs?searchtype=author&query=Binder%2C+P">Polina Binder</a>, <a href="/search/cs?searchtype=author&query=Greaves%2C+M">Malcolm Greaves</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+V">Vega Shah</a>, <a href="/search/cs?searchtype=author&query=John%2C+J+S">John St. John</a>, <a href="/search/cs?searchtype=author&query=Lange%2C+A">Adrian Lange</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+P">Patrick Hsu</a>, <a href="/search/cs?searchtype=author&query=Illango%2C+R">Rajesh Illango</a>, <a href="/search/cs?searchtype=author&query=Ramanathan%2C+A">Arvind Ramanathan</a>, <a href="/search/cs?searchtype=author&query=Anandkumar%2C+A">Anima Anandkumar</a>, <a href="/search/cs?searchtype=author&query=Brookes%2C+D+H">David H Brookes</a>, <a href="/search/cs?searchtype=author&query=Busia%2C+A">Akosua Busia</a>, <a href="/search/cs?searchtype=author&query=Mahajan%2C+A">Abhishaike Mahajan</a>, <a href="/search/cs?searchtype=author&query=Malina%2C+S">Stephen Malina</a>, <a href="/search/cs?searchtype=author&query=Prasad%2C+N">Neha Prasad</a>, <a href="/search/cs?searchtype=author&query=Sinai%2C+S">Sam Sinai</a>, <a href="/search/cs?searchtype=author&query=Edwards%2C+L">Lindsay Edwards</a>, <a href="/search/cs?searchtype=author&query=Gaudelet%2C+T">Thomas Gaudelet</a>, <a href="/search/cs?searchtype=author&query=Regep%2C+C">Cristian Regep</a>, <a href="/search/cs?searchtype=author&query=Steinegger%2C+M">Martin Steinegger</a>, <a href="/search/cs?searchtype=author&query=Rost%2C+B">Burkhard Rost</a>, <a href="/search/cs?searchtype=author&query=Brace%2C+A">Alexander Brace</a>, <a href="/search/cs?searchtype=author&query=Hippe%2C+K">Kyle Hippe</a>, <a href="/search/cs?searchtype=author&query=Naef%2C+L">Luca Naef</a> , et al. (63 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10548v1-abstract-short" style="display: inline;"> Artificial Intelligence models encoding biology and chemistry are opening new routes to high-throughput and high-quality in-silico drug development. However, their training increasingly relies on computational scale, with recent protein language models (pLM) training on hundreds of graphical processing units (GPUs). We introduce the BioNeMo Framework to facilitate the training of computational bio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10548v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10548v1-abstract-full" style="display: none;"> Artificial Intelligence models encoding biology and chemistry are opening new routes to high-throughput and high-quality in-silico drug development. However, their training increasingly relies on computational scale, with recent protein language models (pLM) training on hundreds of graphical processing units (GPUs). We introduce the BioNeMo Framework to facilitate the training of computational biology and chemistry AI models across hundreds of GPUs. Its modular design allows the integration of individual components, such as data loaders, into existing workflows and is open to community contributions. We detail technical features of the BioNeMo Framework through use cases such as pLM pre-training and fine-tuning. On 256 NVIDIA A100s, BioNeMo Framework trains a three billion parameter BERT-based pLM on over one trillion tokens in 4.2 days. The BioNeMo Framework is open-source and free for everyone to use. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10548v1-abstract-full').style.display = 'none'; document.getElementById('2411.10548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10414">arXiv:2411.10414</a> <span> [<a href="https://arxiv.org/pdf/2411.10414">pdf</a>, <a href="https://arxiv.org/format/2411.10414">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chi%2C+J">Jianfeng Chi</a>, <a href="/search/cs?searchtype=author&query=Karn%2C+U">Ujjwal Karn</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+H">Hongyuan Zhan</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+E">Eric Smith</a>, <a href="/search/cs?searchtype=author&query=Rando%2C+J">Javier Rando</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Plawiak%2C+K">Kate Plawiak</a>, <a href="/search/cs?searchtype=author&query=Coudert%2C+Z+D">Zacharie Delpierre Coudert</a>, <a href="/search/cs?searchtype=author&query=Upasani%2C+K">Kartikeya Upasani</a>, <a href="/search/cs?searchtype=author&query=Pasupuleti%2C+M">Mahesh Pasupuleti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10414v1-abstract-short" style="display: inline;"> We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for human-AI conversations that involves image understanding: it can be used to safeguard content for both multimodal LLM inputs (prompt classification) and outputs (response classification). Unlike the previous text-only Llama Guard versions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed to support image re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10414v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10414v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10414v1-abstract-full" style="display: none;"> We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for human-AI conversations that involves image understanding: it can be used to safeguard content for both multimodal LLM inputs (prompt classification) and outputs (response classification). Unlike the previous text-only Llama Guard versions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed to support image reasoning use cases and is optimized to detect harmful multimodal (text and image) prompts and text responses to these prompts. Llama Guard 3 Vision is fine-tuned on Llama 3.2-Vision and demonstrates strong performance on the internal benchmarks using the MLCommons taxonomy. We also test its robustness against adversarial attacks. We believe that Llama Guard 3 Vision serves as a good starting point to build more capable and robust content moderation tools for human-AI conversation with multimodal capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10414v1-abstract-full').style.display = 'none'; document.getElementById('2411.10414v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10346">arXiv:2411.10346</a> <span> [<a href="https://arxiv.org/pdf/2411.10346">pdf</a>, <a href="https://arxiv.org/format/2411.10346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BiDense: Binarization for Dense Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+R">Rui Yin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haotong Qin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yulun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenbo Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianjun Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+B">Biao Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10346v2-abstract-short" style="display: inline;"> Dense prediction is a critical task in computer vision. However, previous methods often require extensive computational resources, which hinders their real-world application. In this paper, we propose BiDense, a generalized binary neural network (BNN) designed for efficient and accurate dense prediction tasks. BiDense incorporates two key techniques: the Distribution-adaptive Binarizer (DAB) and t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10346v2-abstract-full').style.display = 'inline'; document.getElementById('2411.10346v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10346v2-abstract-full" style="display: none;"> Dense prediction is a critical task in computer vision. However, previous methods often require extensive computational resources, which hinders their real-world application. In this paper, we propose BiDense, a generalized binary neural network (BNN) designed for efficient and accurate dense prediction tasks. BiDense incorporates two key techniques: the Distribution-adaptive Binarizer (DAB) and the Channel-adaptive Full-precision Bypass (CFB). The DAB adaptively calculates thresholds and scaling factors for binarization, effectively retaining more information within BNNs. Meanwhile, the CFB facilitates full-precision bypassing for binary convolutional layers undergoing various channel size transformations, which enhances the propagation of real-valued signals and minimizes information loss. By leveraging these techniques, BiDense preserves more real-valued information, enabling more accurate and detailed dense predictions in BNNs. Extensive experiments demonstrate that our framework achieves performance levels comparable to full-precision models while significantly reducing memory usage and computational costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10346v2-abstract-full').style.display = 'none'; document.getElementById('2411.10346v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10232">arXiv:2411.10232</a> <span> [<a href="https://arxiv.org/pdf/2411.10232">pdf</a>, <a href="https://arxiv.org/format/2411.10232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ColorEdit: Training-free Image-Guided Color editing with diffusion model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xingxi Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chenglin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10232v1-abstract-short" style="display: inline;"> Text-to-image (T2I) diffusion models, with their impressive generative capabilities, have been adopted for image editing tasks, demonstrating remarkable efficacy. However, due to attention leakage and collision between the cross-attention map of the object and the new color attribute from the text prompt, text-guided image editing methods may fail to change the color of an object, resulting in a m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10232v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10232v1-abstract-full" style="display: none;"> Text-to-image (T2I) diffusion models, with their impressive generative capabilities, have been adopted for image editing tasks, demonstrating remarkable efficacy. However, due to attention leakage and collision between the cross-attention map of the object and the new color attribute from the text prompt, text-guided image editing methods may fail to change the color of an object, resulting in a misalignment between the resulting image and the text prompt. In this paper, we conduct an in-depth analysis on the process of text-guided image synthesizing and what semantic information different cross-attention blocks have learned. We observe that the visual representation of an object is determined in the up-block of the diffusion model in the early stage of the denoising process, and color adjustment can be achieved through value matrices alignment in the cross-attention layer. Based on our findings, we propose a straightforward, yet stable, and effective image-guided method to modify the color of an object without requiring any additional fine-tuning or training. Lastly, we present a benchmark dataset called COLORBENCH, the first benchmark to evaluate the performance of color change methods. Extensive experiments validate the effectiveness of our method in object-level color editing and surpass the performance of popular text-guided image editing approaches in both synthesized and real images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10232v1-abstract-full').style.display = 'none'; document.getElementById('2411.10232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository