Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 194 results for author: <span class="mathjax">Guan, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Guan%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Guan, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Guan%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Guan, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Guan%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Guan%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Guan%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Guan%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Guan%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19824">arXiv:2503.19824</a> <span> [<a href="https://arxiv.org/pdf/2503.19824">pdf</a>, <a href="https://arxiv.org/format/2503.19824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> AudCast: Audio-Driven Human Video Generation by Cascaded Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiazhi Guan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaisiyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiliang Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Quanwei Yang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yasheng Sun</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengyi He</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Borong Liang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yukang Cao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yingying Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Haocheng Feng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Youjian Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: 
<span class="abstract-short has-text-grey-dark mathjax" id="2503.19824v1-abstract-short" style="display: inline;"> Despite the recent progress of audio-driven video generation, existing methods mostly focus on driving facial movements, leading to non-coherent head and body dynamics. Moving forward, it is desirable yet challenging to generate holistic human videos with both accurate lip-sync and delicate co-speech gestures w.r.t. given audio. In this work, we propose AudCast, a generalized audio-driven human vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19824v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19824v1-abstract-full" style="display: none;"> Despite the recent progress of audio-driven video generation, existing methods mostly focus on driving facial movements, leading to non-coherent head and body dynamics. Moving forward, it is desirable yet challenging to generate holistic human videos with both accurate lip-sync and delicate co-speech gestures w.r.t. given audio. In this work, we propose AudCast, a generalized audio-driven human video generation framework adopting a cascade Diffusion-Transformers (DiTs) paradigm, which synthesizes holistic human videos based on a reference image and a given audio. 1) Firstly, an audio-conditioned Holistic Human DiT architecture is proposed to directly drive the movements of any human body with vivid gesture dynamics. 2) Then to enhance hand and face details that are well-knownly difficult to handle, a Regional Refinement DiT leverages regional 3D fitting as the bridge to reform the signals, producing the final results. Extensive experiments demonstrate that our framework generates high-fidelity audio-driven holistic human videos with temporal coherence and fine facial and hand details. Resources can be found at https://guanjz20.github.io/projects/AudCast. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19824v1-abstract-full').style.display = 'none'; document.getElementById('2503.19824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025. 
   Project page: https://guanjz20.github.io/projects/AudCast

2. arXiv:2503.17340 [pdf, other] (cs.MM, cs.AI, cs.CV, cs.SD, eess.AS)
   Align Your Rhythm: Generating Highly Aligned Dance Poses with Gating-Enhanced Rhythm-Aware Feature Representation
   Authors: Congyi Fan, Jian Guan, Xuanjia Zhao, Dongli Xu, Youtian Lin, Tong Ye, Pengming Feng, Haiwei Pan
   Abstract: Automatically generating natural, diverse, and rhythmic human dance movements driven by music is vital for the virtual reality and film industries. However, generating dance that naturally follows music remains a challenge, as existing methods lack proper beat alignment and exhibit unnatural motion dynamics. In this paper, we propose Danceba, a novel framework that leverages a gating mechanism to enhance rhythm-aware feature representation for music-driven dance generation, achieving highly aligned dance poses with enhanced rhythmic sensitivity. Specifically, we introduce Phase-Based Rhythm Extraction (PRE) to precisely extract rhythmic information from musical phase data, capitalizing on the intrinsic periodicity and temporal structures of music.
   Additionally, we propose Temporal-Gated Causal Attention (TGCA) to focus on global rhythmic features, ensuring that dance movements closely follow the musical rhythm. We also introduce a Parallel Mamba Motion Modeling (PMMM) architecture to separately model upper- and lower-body motions along with musical features, thereby improving the naturalness and diversity of the generated dance movements. Extensive experiments confirm that Danceba outperforms state-of-the-art methods, achieving significantly better rhythmic alignment and motion diversity. Project page: https://danceba.github.io/
   Submitted 21 March, 2025; originally announced March 2025.
   Comments: 10 pages, 6 figures
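The abstract's premise that rhythm is recoverable from spectral phase can be made concrete with a classical phase-deviation onset detector. This is an illustrative sketch only, not the paper's learned PRE module; the window and hop sizes are arbitrary choices for the example.

    # Illustrative only: classical phase-deviation onset detection, which
    # derives a rhythm-like envelope from STFT phase. Not the paper's PRE.
    import numpy as np
    from scipy.signal import stft

    def phase_deviation_envelope(audio, sr, n_fft=1024, hop=256):
        """Frames whose phase departs from linear phase progression tend
        to coincide with note onsets, i.e. the rhythmic skeleton."""
        _, _, Z = stft(audio, fs=sr, nperseg=n_fft, noverlap=n_fft - hop)
        phi = np.angle(Z)                         # (freq_bins, frames)
        # Predicted phase under constant instantaneous frequency:
        # phi_hat[n] = 2 * phi[n-1] - phi[n-2]
        pred = 2 * phi[:, 1:-1] - phi[:, :-2]
        dev = np.angle(np.exp(1j * (phi[:, 2:] - pred)))  # wrap to (-pi, pi]
        return np.abs(dev).mean(axis=0)           # one value per frame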
3. arXiv:2503.17003 [pdf, other] (cs.CL)
   A Survey on Personalized Alignment -- The Missing Piece for Large Language Models in Real-World Applications
   Authors: Jian Guan, Junfei Wu, Jia-Nan Li, Chuanqi Cheng, Wei Wu
   Abstract: Large Language Models (LLMs) have demonstrated remarkable capabilities, yet their transition to real-world applications reveals a critical limitation: the inability to adapt to individual preferences while maintaining alignment with universal human values. Current alignment techniques adopt a one-size-fits-all approach that fails to accommodate users' diverse backgrounds and needs. This paper presents the first comprehensive survey of personalized alignment, a paradigm that enables LLMs to adapt their behavior within ethical boundaries based on individual preferences. We propose a unified framework comprising preference memory management, personalized generation, and feedback-based alignment, systematically analyzing implementation approaches and evaluating their effectiveness across various scenarios. By examining current techniques, potential risks, and future challenges, this survey provides a structured foundation for developing more adaptable and ethically aligned LLMs.
   Submitted 23 March, 2025; v1 submitted 21 March, 2025; originally announced March 2025.
   Comments: 9 pages

4. arXiv:2503.15463 [pdf, other] (cs.CL, cs.AI)
   From 1,000,000 Users to Every User: Scaling Up Personalized Preference for User-level Alignment
   Authors: Jia-Nan Li, Jian Guan, Songhao Wu, Wei Wu, Rui Yan
   Abstract: Large language models (LLMs) have traditionally been aligned through one-size-fits-all approaches that assume uniform human preferences, fundamentally overlooking the diversity in user values and needs. This paper introduces a comprehensive framework for scalable personalized alignment of LLMs.
   We establish a systematic preference space characterizing psychological and behavioral dimensions, alongside diverse persona representations for robust preference inference in real-world scenarios. Building upon this foundation, we introduce AlignX, a large-scale dataset of over 1.3 million personalized preference examples, and develop two complementary alignment approaches: in-context alignment, which directly conditions on persona representations, and preference-bridged alignment, which models intermediate preference distributions. Extensive experiments demonstrate substantial improvements over existing methods, with an average 17.06% accuracy gain across four benchmarks, alongside strong adaptation to novel preferences, robustness to limited user data, and precise preference controllability. These results validate our framework's effectiveness, advancing toward truly user-adaptive AI systems.
   Submitted 21 March, 2025; v1 submitted 19 March, 2025; originally announced March 2025.
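As a rough illustration of what "in-context alignment directly conditioning on persona representations" could look like at inference time, here is a hypothetical prompt-construction helper; the profile fields and wording are invented for the example, not taken from the paper.

    # Hypothetical sketch: condition generation on a persona by prepending
    # a textual profile to the prompt. Field names are invented.
    def build_aligned_prompt(persona: dict, user_query: str) -> str:
        persona_desc = "; ".join(f"{k}: {v}" for k, v in persona.items())
        return (
            "You are assisting a user with the following preference profile.\n"
            f"Profile: {persona_desc}\n"
            "Adapt tone, detail level, and recommendations to this profile "
            "while staying within universal safety and ethics constraints.\n\n"
            f"User: {user_query}\nAssistant:"
        )

    prompt = build_aligned_prompt(
        {"expertise": "novice", "style": "concise", "values": "privacy-focused"},
        "How should I back up my photos?",
    )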
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13848">arXiv:2503.13848</a> <span> [<a href="https://arxiv.org/pdf/2503.13848">pdf</a>, <a href="https://arxiv.org/format/2503.13848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FlexStep: Enabling Flexible Error Detection in Multi/Many-core Real-time Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tinglue Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiming Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiapeng Guan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhenghui Guo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Renshuang Jiang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+R">Ran Wei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhe Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13848v1-abstract-short" style="display: inline;"> Reliability and real-time responsiveness in safety-critical systems have traditionally been achieved using error detection mechanisms, such as LockStep, which require pre-configured checker cores,strict synchronisation between main and checker cores, static error detection regions, or limited preemption capabilities. However, these core-bound hardware mechanisms often lead to significant resource… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13848v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13848v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13848v1-abstract-full" style="display: none;"> Reliability and real-time responsiveness in safety-critical systems have traditionally been achieved using error detection mechanisms, such as LockStep, which require pre-configured checker cores,strict synchronisation between main and checker cores, static error detection regions, or limited preemption capabilities. However, these core-bound hardware mechanisms often lead to significant resource over-provisioning, and diminished real-time responsiveness, particularly in modern systems where tasks with varying reliability requirements are consolidated on shared processors to improve efficiency, reduce costs, and save power. To address these challenges, this work presents FlexStep, a systematic solution that integrates hardware and software across the SoC, ISA, and OS scheduling layers. FlexStep features a novel microarchitecture that supports dynamic core configuration and asynchronous, preemptive error detection. The FlexStep architecture naturally allows for flexible task scheduling and error detection, enabling new scheduling algorithms that enhance both resource efficiency and real-time schedulability. 
   We publicly release FlexStep's source code at https://anonymous.4open.science/r/FlexStep-DAC25-7B0C.
   Submitted 17 March, 2025; originally announced March 2025.

6. arXiv:2503.09942 [pdf, other] (cs.CV)
   Cosh-DiT: Co-Speech Gesture Video Synthesis via Hybrid Audio-Visual Diffusion Transformers
   Authors: Yasheng Sun, Zhiliang Xu, Hang Zhou, Jiazhi Guan, Quanwei Yang, Kaisiyuan Wang, Borong Liang, Yingying Li, Haocheng Feng, Jingdong Wang, Ziwei Liu, Koike Hideki
   Abstract: Co-speech gesture video synthesis is a challenging task that requires both probabilistic modeling of human gestures and the synthesis of realistic images that align with the rhythmic nuances of speech. To address these challenges, we propose Cosh-DiT, a co-speech gesture video system with hybrid Diffusion Transformers that perform audio-to-motion and motion-to-video synthesis using discrete and continuous diffusion modeling, respectively.
   First, we introduce an audio Diffusion Transformer (Cosh-DiT-A) to synthesize expressive gesture dynamics synchronized with speech rhythms. To capture upper-body, facial, and hand movement priors, we employ vector-quantized variational autoencoders (VQ-VAEs) to jointly learn their dependencies within a discrete latent space. Then, for realistic video synthesis conditioned on the generated speech-driven motion, we design a visual Diffusion Transformer (Cosh-DiT-V) that effectively integrates spatial and temporal contexts. Extensive experiments demonstrate that our framework consistently generates lifelike videos with expressive facial expressions and natural, smooth gestures that align seamlessly with speech.
   Submitted 12 March, 2025; originally announced March 2025.
   Comments: Project page: https://sunyasheng.github.io/projects/COSH-DIT
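The VQ-VAEs mentioned in the abstract rest on one core operation: snapping each continuous latent to its nearest codebook vector. A minimal sketch of that lookup follows, with illustrative sizes rather than the paper's configuration.

    # Minimal VQ-VAE quantization step: nearest-neighbour codebook lookup.
    # Sizes below are illustrative, not the paper's settings.
    import numpy as np

    def vq_quantize(z, codebook):
        """z: (n, d) encoder outputs; codebook: (K, d) learned embeddings.
        Returns the quantized vectors and their discrete token ids."""
        # Squared Euclidean distance between every latent and every code.
        d2 = ((z[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)  # (n, K)
        idx = d2.argmin(axis=1)          # discrete token per latent
        return codebook[idx], idx

    z = np.random.randn(16, 64)          # e.g. per-frame motion latents
    codebook = np.random.randn(512, 64)
    zq, tokens = vq_quantize(z, codebook)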
7. arXiv:2503.09567 [pdf, other] (cs.AI, cs.CL)
   Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models
   Authors: Qiguang Chen, Libo Qin, Jinhao Liu, Dengyun Peng, Jiannan Guan, Peng Wang, Mengkang Hu, Yuhang Zhou, Te Gao, Wanxiang Che
   Abstract: Recent advancements in reasoning with large language models (RLLMs), such as OpenAI-O1 and DeepSeek-R1, have demonstrated impressive capabilities in complex domains like mathematics and coding. A central factor in their success lies in the application of long chain-of-thought (Long CoT) characteristics, which enhance reasoning abilities and enable the solution of intricate problems. However, despite these developments, a comprehensive survey on Long CoT is still lacking, limiting our understanding of its distinctions from traditional short chain-of-thought (Short CoT) and complicating ongoing debates on issues like "overthinking" and "test-time scaling." This survey seeks to fill this gap by offering a unified perspective on Long CoT. (1) We first distinguish Long CoT from Short CoT and introduce a novel taxonomy to categorize current reasoning paradigms. (2) Next, we explore the key characteristics of Long CoT: deep reasoning, extensive exploration, and feasible reflection, which enable models to handle more complex tasks and produce more efficient, coherent outcomes than the shallower Short CoT. (3) We then investigate key phenomena that emerge alongside these characteristics, including overthinking and test-time scaling, offering insights into how these processes manifest in practice. (4) Finally, we identify significant research gaps and highlight promising future directions, including the integration of multi-modal reasoning, efficiency improvements, and enhanced knowledge frameworks. By providing a structured overview, this survey aims to inspire future research and further the development of logical reasoning in artificial intelligence.
   Submitted 13 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper are available at https://long-cot.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03989">arXiv:2503.03989</a> <span> [<a href="https://arxiv.org/pdf/2503.03989">pdf</a>, <a href="https://arxiv.org/format/2503.03989">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Integrating Protein Dynamics into Structure-Based Drug Design via Full-Atom Stochastic Flows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiangxin Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yi Xiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haowei Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xinheng He</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiaqi Guan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianzhu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03989v1-abstract-short" style="display: inline;"> The dynamic nature of proteins, influenced by ligand interactions, is essential for comprehending protein function and progressing drug discovery. Traditional structure-based drug design (SBDD) approaches typically target binding sites with rigid structures, limiting their practical application in drug development. While molecular dynamics simulation can theoretically capture all the biologically… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03989v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03989v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03989v1-abstract-full" style="display: none;"> The dynamic nature of proteins, influenced by ligand interactions, is essential for comprehending protein function and progressing drug discovery. Traditional structure-based drug design (SBDD) approaches typically target binding sites with rigid structures, limiting their practical application in drug development. While molecular dynamics simulation can theoretically capture all the biologically relevant conformations, the transition rate is dictated by the intrinsic energy barrier between them, making the sampling process computationally expensive. To overcome the aforementioned challenges, we propose to use generative modeling for SBDD considering conformational changes of protein pockets. 
   We curate a dataset of apo and multiple holo states of protein-ligand complexes, simulated by molecular dynamics, and propose a full-atom flow model (and a stochastic version), named DynamicFlow, that learns to transform apo pockets and noisy ligands into holo pockets and corresponding 3D ligand molecules. Our method uncovers promising ligand molecules and corresponding holo conformations of pockets. Additionally, the resultant holo-like states provide superior inputs for traditional SBDD approaches, playing a significant role in practical drug discovery.
   Submitted 5 March, 2025; originally announced March 2025.
   Comments: Accepted to ICLR 2025
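The abstract does not spell out DynamicFlow's training objective, but flow models of this kind commonly build on the generic flow-matching recipe: interpolate between a source state and a target state and regress the velocity that carries one to the other. A minimal sketch under that assumption:

    # Generic flow-matching training pair; an assumed backbone, not the
    # paper's actual objective.
    import numpy as np

    def flow_matching_pair(x_src, x_tgt, rng):
        """x_src: source coordinates (e.g. apo pocket + noisy ligand);
        x_tgt: target coordinates (e.g. holo pocket + ligand pose)."""
        t = rng.uniform()                      # time in [0, 1]
        x_t = (1.0 - t) * x_src + t * x_tgt    # linear interpolation path
        v_target = x_tgt - x_src               # constant velocity on the path
        return x_t, t, v_target                # train v(x_t, t) to match v_target

    rng = np.random.default_rng(0)
    x_apo, x_holo = rng.normal(size=(100, 3)), rng.normal(size=(100, 3))
    x_t, t, v = flow_matching_pair(x_apo, x_holo, rng)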
9. arXiv:2503.02324 [pdf, other] (cs.CL, cs.AI, cs.LG)
   PromptCoT: Synthesizing Olympiad-level Problems for Mathematical Reasoning in Large Language Models
   Authors: Xueliang Zhao, Wei Wu, Jian Guan, Lingpeng Kong
   Abstract: The ability of large language models to solve complex mathematical problems has progressed significantly, particularly for tasks requiring advanced reasoning. However, the scarcity of sufficiently challenging problems, particularly at the Olympiad level, hinders further advancement. In this work, we introduce PromptCoT, a novel approach for automatically generating high-quality Olympiad-level math problems. The proposed method synthesizes complex problems based on mathematical concepts and the rationale behind problem construction, emulating the thought processes of experienced problem designers. We provide a theoretical analysis demonstrating that an optimal rationale should maximize both the likelihood of rationale generation given the associated concepts and the likelihood of problem generation conditioned on both the rationale and the concepts. Our method is evaluated on standard benchmarks including GSM8K, MATH-500, and AIME2024, where it consistently outperforms existing problem-generation methods. Furthermore, we demonstrate that PromptCoT exhibits superior data scalability, maintaining high performance as the dataset size increases and outperforming the baselines. The implementation is available at https://github.com/zhaoxlpku/PromptCoT.
   Submitted 4 March, 2025; originally announced March 2025.
   Comments: Preprint
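The selection criterion stated in the abstract can be written out directly: prefer the rationale r maximizing log p(r | concepts) + log p(problem | r, concepts). The log_p function below is a hypothetical stand-in for a language-model log-likelihood scorer; only the combination rule comes from the abstract.

    # Sketch of the abstract's rationale-selection criterion. log_p is a
    # hypothetical callable: log_p(text, context) -> float (LM log-likelihood).
    def pick_rationale(concepts, candidates, problem, log_p):
        def score(r):
            return (log_p(r, context=concepts)
                    + log_p(problem, context=(r, concepts)))
        return max(candidates, key=score)  # argmax over candidate rationales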
class="abstract-short has-text-grey-dark mathjax" id="2502.13873v2-abstract-short" style="display: inline;"> Deep Neural Networks are increasingly leveraging sparsity to reduce the scaling up of model parameter size. However, reducing wall-clock time through sparsity and pruning remains challenging due to irregular memory access patterns, leading to frequent cache misses. In this paper, we present NPU Vector Runahead (NVR), a prefetching mechanism tailored for NPUs to address cache miss problems in spars… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13873v2-abstract-full').style.display = 'inline'; document.getElementById('2502.13873v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13873v2-abstract-full" style="display: none;"> Deep Neural Networks are increasingly leveraging sparsity to reduce the scaling up of model parameter size. However, reducing wall-clock time through sparsity and pruning remains challenging due to irregular memory access patterns, leading to frequent cache misses. In this paper, we present NPU Vector Runahead (NVR), a prefetching mechanism tailored for NPUs to address cache miss problems in sparse DNN workloads. Rather than optimising memory patterns with high overhead and poor portability, NVR adapts runahead execution to the unique architecture of NPUs. NVR provides a general micro-architectural solution for sparse DNN workloads without requiring compiler or algorithmic support, operating as a decoupled, speculative, lightweight hardware sub-thread alongside the NPU, with minimal hardware overhead (under 5%). NVR achieves an average 90% reduction in cache misses compared to SOTA prefetching in general-purpose processors, delivering 4x average speedup on sparse workloads versus NPUs without prefetching. Moreover, we investigate the advantages of incorporating a small cache (16KB) into the NPU combined with NVR. Our evaluation shows that expanding this modest cache delivers 5x higher performance benefits than increasing the L2 cache size by the same amount. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13873v2-abstract-full').style.display = 'none'; document.getElementById('2502.13873v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13191">arXiv:2502.13191</a> <span> [<a href="https://arxiv.org/pdf/2502.13191">pdf</a>, <a href="https://arxiv.org/format/2502.13191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On the Privacy Risks of Spiking Neural Networks: A Membership Inference Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+J">Junyi Guan</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+A">Abhijith Sharma</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chong Tian</a>, <a href="/search/cs?searchtype=author&query=Lahlou%2C+S">Salem Lahlou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13191v2-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) are increasingly explored for their energy efficiency and robustness in real-world applications, yet their privacy risks remain largely unexamined. In this work, we investigate the susceptibility of SNNs to Membership Inference Attacks (MIAs) -- a major privacy threat where an adversary attempts to determine whether a given sample was part of the training dataset. Wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13191v2-abstract-full').style.display = 'inline'; document.getElementById('2502.13191v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13191v2-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) are increasingly explored for their energy efficiency and robustness in real-world applications, yet their privacy risks remain largely unexamined. In this work, we investigate the susceptibility of SNNs to Membership Inference Attacks (MIAs) -- a major privacy threat where an adversary attempts to determine whether a given sample was part of the training dataset. While prior work suggests that SNNs may offer inherent robustness due to their discrete, event-driven nature, we find that its resilience diminishes as latency (T) increases. Furthermore, we introduce an input dropout strategy under black box setting, that significantly enhances membership inference in SNNs. Our findings challenge the assumption that SNNs are inherently more secure, and even though they are expected to be better, our results reveal that SNNs exhibit privacy vulnerabilities that are equally comparable to Artificial Neural Networks (ANNs). Our code is available at https://anonymous.4open.science/r/MIA_SNN-3610. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13191v2-abstract-full').style.display = 'none'; document.getElementById('2502.13191v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09622">arXiv:2502.09622</a> <span> [<a href="https://arxiv.org/pdf/2502.09622">pdf</a>, <a href="https://arxiv.org/format/2502.09622">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Theoretical Benefit and Limitation of Diffusion Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+G">Guhao Feng</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yihan Geng</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liwei Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Di He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09622v1-abstract-short" style="display: inline;"> Diffusion language models have emerged as a promising approach for text generation. One would naturally expect this method to be an efficient replacement for autoregressive models since multiple tokens can be sampled in parallel during each diffusion step. However, its efficiency-accuracy trade-off is not yet well understood. In this paper, we present a rigorous theoretical analysis of a widely us… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09622v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09622v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09622v1-abstract-full" style="display: none;"> Diffusion language models have emerged as a promising approach for text generation. One would naturally expect this method to be an efficient replacement for autoregressive models since multiple tokens can be sampled in parallel during each diffusion step. However, its efficiency-accuracy trade-off is not yet well understood. 

arXiv:2502.09622 (https://arxiv.org/abs/2502.09622) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); stat.ML (Machine Learning)
Title: Theoretical Benefit and Limitation of Diffusion Language Model
Authors: Guhao Feng, Yihan Geng, Jian Guan, Wei Wu, Liwei Wang, Di He
Abstract: Diffusion language models have emerged as a promising approach for text generation. One would naturally expect this method to be an efficient replacement for autoregressive models, since multiple tokens can be sampled in parallel during each diffusion step. However, its efficiency-accuracy trade-off is not yet well understood. In this paper, we present a rigorous theoretical analysis of a widely used type of diffusion language model, the Masked Diffusion Model (MDM), and find that its effectiveness heavily depends on the target evaluation metric. Under mild conditions, we prove that when using perplexity as the metric, MDMs can achieve near-optimal perplexity in a constant number of sampling steps regardless of sequence length, demonstrating that efficiency can be achieved without sacrificing performance. However, when using the sequence error rate -- which is important for understanding the "correctness" of a sequence, such as a reasoning chain -- we show that the required sampling steps must scale linearly with sequence length to obtain "correct" sequences, thereby eliminating MDM's efficiency advantage over autoregressive models. Our analysis establishes the first theoretical foundation for understanding the benefits and limitations of MDMs. All theoretical findings are supported by empirical studies.
Submitted 13 February, 2025; originally announced February 2025.
Comments: 32 pages, 3 figures
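
The perplexity-versus-sequence-error gap in this abstract reduces to elementary arithmetic: holding per-token error fixed keeps perplexity-style metrics flat, while whole-sequence correctness decays geometrically with length, which is why sequence-level guarantees force the step count to grow. A three-line check:

```python
# With per-token error eps fixed (good perplexity), whole-sequence
# accuracy (1 - eps)**L still collapses as the length L grows.
eps = 0.01
for L in (10, 100, 1000):
    print(L, (1 - eps) ** L)   # ~0.904, ~0.366, ~4.3e-5
```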

arXiv:2502.07663 (https://arxiv.org/abs/2502.07663) [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CY (Computers and Society); cs.HC (Human-Computer Interaction)
Title: Human Decision-making is Susceptible to AI-driven Manipulation
Authors: Sahand Sabour, June M. Liu, Siyang Liu, Chris Z. Yao, Shiyao Cui, Xuanming Zhang, Wen Zhang, Yaru Cao, Advait Bhat, Jian Guan, Wei Wu, Rada Mihalcea, Hongning Wang, Tim Althoff, Tatia M. C. Lee, Minlie Huang
Abstract: Artificial Intelligence (AI) systems are increasingly intertwined with daily life, assisting users in executing various tasks and providing guidance on decision-making. This integration introduces risks of AI-driven manipulation, where such systems may exploit users' cognitive biases and emotional vulnerabilities to steer them toward harmful outcomes. Through a randomized controlled trial with 233 participants, we examined human susceptibility to such manipulation in financial (e.g., purchases) and emotional (e.g., conflict resolution) decision-making contexts. Participants interacted with one of three AI agents: a neutral agent (NA) optimizing for user benefit without explicit influence, a manipulative agent (MA) designed to covertly influence beliefs and behaviors, or a strategy-enhanced manipulative agent (SEMA) employing explicit psychological tactics to reach its hidden objectives. By analyzing participants' decision patterns and shifts in their preference ratings post-interaction, we found significant susceptibility to AI-driven manipulation. Particularly, across both decision-making domains, participants interacting with the manipulative agents shifted toward harmful options at substantially higher rates (financial, MA: 62.3%, SEMA: 59.6%; emotional, MA: 42.3%, SEMA: 41.5%) compared to the NA group (financial, 35.8%; emotional, 12.8%). Notably, our findings reveal that even subtle manipulative objectives (MA) can be as effective as explicit psychological strategies (SEMA) in swaying human decision-making. By revealing the potential for covert AI influence, this study highlights a critical vulnerability in human-AI interactions, emphasizing the need for ethical safeguards and regulatory frameworks to ensure responsible deployment of AI technologies and protect human autonomy.
Submitted 24 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
Comments: Work in progress

arXiv:2501.15055 (https://arxiv.org/abs/2501.15055) [pdf, other]
Subjects: q-bio.BM (Biomolecules); cs.AI (Artificial Intelligence)
Title: Group Ligands Docking to Protein Pockets
Authors: Jiaqi Guan, Jiahan Li, Xiangxin Zhou, Xingang Peng, Sheng Wang, Yunan Luo, Jian Peng, Jianzhu Ma
Abstract: Molecular docking is a key task in computational biology that has attracted increasing interest from the machine learning community. While existing methods have achieved success, they generally treat each protein-ligand pair in isolation. Inspired by the biochemical observation that ligands binding to the same target protein tend to adopt similar poses, we propose GroupBind, a novel molecular docking framework that simultaneously considers multiple ligands docking to a protein. This is achieved by introducing an interaction layer for the group of ligands and a triangle attention module for embedding protein-ligand and group-ligand pairs. By integrating our approach with a diffusion-based docking model, we set a new state-of-the-art performance on the PDBBind blind docking benchmark, demonstrating the effectiveness of our proposed molecular docking paradigm.
Submitted 24 January, 2025; originally announced January 2025.
Comments: 18 pages, published in ICLR 2025

arXiv:2501.01604 (https://arxiv.org/abs/2501.01604) [pdf, other]
Subjects: cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Disentangling Hierarchical Features for Anomalous Sound Detection Under Domain Shift
Authors: Jian Guan, Jiantong Tian, Qiaoxi Zhu, Feiyang Xiao, Hejing Zhang, Xubo Liu
Abstract: Anomalous sound detection (ASD) encounters difficulties with domain shift, where the sounds of machines in target domains differ significantly from those in source domains due to varying operating conditions. Existing methods typically employ domain classifiers to enhance detection performance, but they often overlook the influence of domain-unrelated information. This oversight can hinder the model's ability to clearly distinguish between domains, thereby weakening its capacity to differentiate normal from abnormal sounds. In this paper, we propose a Gradient Reversal-based Hierarchical feature Disentanglement (GRHD) method to address the above challenge. GRHD uses gradient reversal to separate domain-related features from domain-unrelated ones, resulting in more robust feature representations. Additionally, the method employs a hierarchical structure to guide the learning of fine-grained, domain-specific features by leveraging available metadata, such as section IDs and machine sound attributes. Experimental results on the DCASE 2022 Challenge Task 2 dataset demonstrate that the proposed method significantly improves ASD performance under domain shift.
Submitted 2 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025

arXiv:2412.20768 (https://arxiv.org/abs/2412.20768) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Sample Correlation for Fingerprinting Deep Face Recognition
Authors: Jiyang Guan, Jian Liang, Yanbo Wang, Ran He
Abstract: Face recognition has witnessed remarkable advancements in recent years, thanks to the development of deep learning techniques. However, an off-the-shelf face recognition model as a commercial service could be stolen by model stealing attacks, posing great threats to the rights of the model owner. Model fingerprinting, as a model stealing detection method, aims to verify whether a suspect model is stolen from the victim model, and it is gaining more and more attention nowadays. Previous methods always utilize transferable adversarial examples as the model fingerprint, but this approach is known to be sensitive to adversarial defense and transfer learning techniques. To address this issue, we instead consider the pairwise relationship between samples and propose a novel yet simple model stealing detection method based on SAmple Correlation (SAC). Specifically, we present SAC-JC, which selects JPEG-compressed samples as model inputs and calculates the correlation matrix among their model outputs. Extensive results validate that SAC successfully defends against various model stealing attacks in deep face recognition, encompassing face verification and face emotion recognition, exhibiting the highest performance in terms of AUC, p-value and F1 score. Furthermore, we extend our evaluation of SAC-JC to object recognition datasets including Tiny-ImageNet and CIFAR10, which also demonstrates the superior performance of SAC-JC over previous methods. The code will be available at https://github.com/guanjiyang/SAC_JC.
Submitted 30 December, 2024; originally announced December 2024.
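
The SAC idea is concrete enough to sketch: probe both models with the same JPEG-compressed inputs, build each model's sample-correlation matrix, and compare the two matrices. The code below is an illustrative reconstruction, not the authors' implementation; probe selection and decision thresholds are left out.

```python
# Hedged sketch of the SAC idea: compare *pairwise sample correlations*
# of model outputs rather than per-sample outputs themselves.
import io
import numpy as np
from PIL import Image

def jpeg_compress(img_arr, quality=60):
    """img_arr: HxWx3 uint8 image; returns its JPEG-recompressed version."""
    buf = io.BytesIO()
    Image.fromarray(img_arr).save(buf, format="JPEG", quality=quality)
    return np.asarray(Image.open(buf))

def correlation_matrix(outputs):
    """outputs: float array (n_samples, n_dims) of model responses."""
    z = outputs - outputs.mean(axis=1, keepdims=True)
    z /= z.std(axis=1, keepdims=True) + 1e-8
    return z @ z.T / z.shape[1]        # sample-by-sample correlations

def sac_distance(victim_out, suspect_out):
    return np.abs(correlation_matrix(victim_out)
                  - correlation_matrix(suspect_out)).mean()

# A small distance suggests the suspect model was derived from the victim;
# correlations survive output perturbations that break per-sample checks.
```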

arXiv:2412.20211 (https://arxiv.org/abs/2412.20211) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.IR (Information Retrieval)
Title: Sequence Generation Modeling for Continuous Value Prediction
Authors: Hongxu Ma, Kai Tian, Tao Zhang, Xuefeng Zhang, Chunjie Chen, Han Li, Jihong Guan, Shuigeng Zhou
Abstract: Continuous value prediction (CVP) plays a crucial role in short video recommendation, capturing user preferences through precise numerical estimations. However, traditional regression-based methods often struggle with challenges like wide value ranges and imbalanced data, leading to prediction bias. While ordinal classification approaches have been introduced to address these issues, their reliance on discretization reduces accuracy and overlooks inherent relationships between intervals. To overcome these limitations, we introduce a novel Generative Regression (GR) framework for CVP, inspired by sequence generation techniques in language modeling. Our method transforms numerical values into token sequences through structural discretization, preserving original data fidelity while improving prediction precision. Leveraging a carefully crafted vocabulary and label encoding, GR employs curriculum learning with an embedding mixup strategy to bridge training-inference gaps. Experimental evaluations on four public datasets and one large-scale industrial dataset validate the superiority of GR over existing methods. Real-world A/B tests on Kuaishou, a leading video platform, further demonstrate its practical effectiveness. Additionally, GR proves adaptable to other regression tasks, such as Lifetime Value (LTV) prediction, showcasing its potential as a robust solution for diverse CVP challenges.
Submitted 24 January, 2025; v1 submitted 28 December, 2024; originally announced December 2024.
Comments: 10 pages, 5 figures, conference or other essential info
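
The abstract's "structural discretization" maps a continuous target to a token sequence that a generative model can emit. The paper's actual vocabulary and label encoding are not described here, so the toy below only shows the mechanics with plain digit tokens; all names are hypothetical.

```python
# Toy value <-> token-sequence conversion, illustrating the mechanics of
# treating regression as sequence generation. Not the paper's encoding.
def value_to_tokens(v, n_digits=4):
    s = f"{v:0{n_digits}.0f}" if v < 10 ** n_digits else "9" * n_digits
    return ["<bos>", *list(s), "<eos>"]

def tokens_to_value(tokens):
    digits = [t for t in tokens if t.isdigit()]
    return float("".join(digits) or "0")

toks = value_to_tokens(137.0)       # ['<bos>', '0', '1', '3', '7', '<eos>']
print(toks, tokens_to_value(toks))  # ... 137.0
```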

arXiv:2412.19404 (https://arxiv.org/abs/2412.19404) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Spectral-Temporal Fusion Representation for Person-in-Bed Detection
Authors: Xuefeng Yang, Shiheng Zhang, Jian Guan, Feiyang Xiao, Wei Lu, Qiaoxi Zhu
Abstract: This study is based on the ICASSP 2025 Signal Processing Grand Challenge's Accelerometer-Based Person-in-Bed Detection Challenge, which aims to determine bed occupancy using accelerometer signals. The task is divided into two tracks: "in bed" and "not in bed" segmented detection, and streaming detection, facing challenges such as individual differences, posture variations, and external disturbances. We propose a spectral-temporal fusion-based feature representation method with mixup data augmentation, and adopt Intersection over Union (IoU) loss to optimize detection accuracy. In the two tracks, our method achieved outstanding detection scores of 100.00% and 95.55%, securing first place and third place, respectively.
Submitted 26 December, 2024; originally announced December 2024.
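
Two ingredients named above are standard and easy to pin down: mixup (a convex combination of inputs and labels) and a 1-D IoU loss over occupancy intervals. A minimal sketch of both, assuming intervals are (start, end) pairs; the challenge-specific settings are not reproduced.

```python
# Standard mixup plus a 1-D IoU loss for temporal interval detection.
import numpy as np

def mixup(x1, y1, x2, y2, alpha=0.2, rng=np.random.default_rng(0)):
    lam = rng.beta(alpha, alpha)          # mixing coefficient in (0, 1)
    return lam * x1 + (1 - lam) * x2, lam * y1 + (1 - lam) * y2

def iou_loss(pred, true):
    """pred, true: (start, end) intervals; returns 1 - IoU."""
    inter = max(0.0, min(pred[1], true[1]) - max(pred[0], true[0]))
    union = (pred[1] - pred[0]) + (true[1] - true[0]) - inter
    return 1.0 - inter / union if union > 0 else 1.0

print(iou_loss((2.0, 8.0), (4.0, 10.0)))  # overlap 4 / union 8 -> 0.5
```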

arXiv:2412.19068 (https://arxiv.org/abs/2412.19068) [pdf, ps, other]
Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
Title: Attacking Voice Anonymization Systems with Augmented Feature and Speaker Identity Difference
Authors: Yanzhe Zhang, Zhonghao Bi, Feiyang Xiao, Xuefeng Yang, Qiaoxi Zhu, Jian Guan
Abstract: This study focuses on the First VoicePrivacy Attacker Challenge within the ICASSP 2025 Signal Processing Grand Challenge, which aims to develop speaker verification systems capable of determining whether two anonymized speech signals are from the same speaker. However, differences between the feature distributions of original and anonymized speech complicate this task. To address this challenge, we propose an attacker system, termed DA-SID, that combines a Data Augmentation enhanced feature representation with a Speaker Identity Difference enhanced classifier to improve verification performance. Specifically, data augmentation strategies (i.e., data fusion and SpecAugment) are utilized to mitigate feature distribution gaps, while probabilistic linear discriminant analysis (PLDA) is employed to further enhance speaker identity difference. Our system significantly outperforms the baseline, demonstrating exceptional effectiveness and robustness against various voice anonymization systems, ultimately securing a top-5 ranking in the challenge.
Submitted 12 January, 2025; v1 submitted 26 December, 2024; originally announced December 2024.
Comments: 2 pages, submitted to ICASSP 2025 GC-7: The First VoicePrivacy Attacker Challenge (by invitation), fixed a numerical typo: In Table II, the EER% for DA-SID w/o DA under T8-5 is corrected to 26.96

arXiv:2412.18235 (https://arxiv.org/abs/2412.18235) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Band Prompting Aided SAR and Multi-Spectral Data Fusion Framework for Local Climate Zone Classification
Authors: Haiyan Lan, Shujun Li, Mingjie Xie, Xuanjia Zhao, Hongning Liu, Pengming Feng, Dongli Xu, Guangjun He, Jian Guan
Abstract: Local climate zone (LCZ) classification is of great value for understanding the complex interactions between urban development and local climate. Recent studies have increasingly focused on the fusion of synthetic aperture radar (SAR) and multi-spectral data to improve LCZ classification performance. However, this remains challenging due to the distinct physical properties of these two types of data and the absence of effective fusion guidance. In this paper, a novel band prompting aided data fusion framework, BP-LCZ, is proposed for LCZ classification. It utilizes textual prompts associated with band groups to guide the model in learning the physical attributes of different bands and the semantics of various categories inherent in SAR and multi-spectral data, augmenting the fused features and thus enhancing LCZ classification performance. Specifically, a band group prompting (BGP) strategy is introduced to align the visual representation effectively at the level of band groups, which also facilitates a more adequate extraction of the semantic information of different bands with textual information. In addition, a multivariate supervised matrix (MSM) based training strategy is proposed to alleviate the problem of positive and negative sample confusion by completing the supervised information. The experimental results demonstrate the effectiveness and superiority of the proposed data fusion framework.
Submitted 24 December, 2024; originally announced December 2024.
Comments: Accepted by ICASSP 2025

arXiv:2412.17717 (https://arxiv.org/abs/2412.17717) [pdf, other]
Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
Title: Fast Causal Discovery by Approximate Kernel-based Generalized Score Functions with Linear Computational Complexity
Authors: Yixin Ren, Haocheng Zhang, Yewei Xia, Hao Zhang, Jihong Guan, Shuigeng Zhou
Abstract: Score-based causal discovery methods can effectively identify causal relationships by evaluating candidate graphs and selecting the one with the highest score. One popular class of scores is kernel-based generalized score functions, which can adapt to a wide range of scenarios and work well in practice because they circumvent assumptions about causal mechanisms and data distributions. Despite these advantages, kernel-based generalized score functions pose serious computational challenges in time and space, with a time complexity of $\mathcal{O}(n^3)$ and a memory complexity of $\mathcal{O}(n^2)$, where $n$ is the sample size. In this paper, we propose an approximate kernel-based generalized score function with $\mathcal{O}(n)$ time and space complexity. We achieve this by using a low-rank technique, designing a set of rules to handle the complex composite matrix operations required to calculate the score, and developing sampling algorithms for different data types so that diverse data can be handled efficiently. Our extensive causal discovery experiments on both synthetic and real-world data demonstrate that, compared to the state-of-the-art method, our method not only significantly reduces computational costs but also achieves comparable accuracy, especially for large datasets.
Submitted 23 December, 2024; originally announced December 2024.
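
The jump from $\mathcal{O}(n^3)$ to $\mathcal{O}(n)$ hinges on a low-rank kernel approximation. The abstract does not name the construction, so the sketch below uses the classic Nyström method as a stand-in: store an $n \times m$ factor $L$ with $LL^\top \approx K$ instead of the full $n \times n$ Gram matrix.

```python
# Minimal Nystrom low-rank kernel factorization: K ~ C W^{-1} C^T, kept
# as an n x m factor L (O(n m) memory, O(n m^2) time) with L L^T ~ K.
import numpy as np

def rbf(a, b, gamma=1.0):
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def nystrom_factor(X, m=50, rng=np.random.default_rng(0)):
    idx = rng.choice(len(X), size=m, replace=False)   # landmark points
    C = rbf(X, X[idx])                                # (n, m) cross-kernel
    W = rbf(X[idx], X[idx])                           # (m, m) landmark kernel
    U, s, _ = np.linalg.svd(W)
    return C @ U / np.sqrt(np.maximum(s, 1e-12))      # L = C U S^{-1/2}

X = np.random.default_rng(1).normal(size=(1000, 5))
L = nystrom_factor(X)   # (1000, 50) factor instead of a (1000, 1000) matrix
```

Downstream score computations then rewrite every Gram-matrix product in terms of $L$, which is where the paper's rules for composite matrix operations would come in.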

arXiv:2412.15365 (https://arxiv.org/abs/2412.15365) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: LISA: Learning-Integrated Space Partitioning Framework for Traffic Accident Forecasting on Heterogeneous Spatiotemporal Data
Authors: Bang An, Xun Zhou, Amin Vahedian, Nick Street, Jinping Guan, Jun Luo
Abstract: Traffic accident forecasting is an important task for intelligent transportation management and emergency response systems. However, this problem is challenging due to the spatial heterogeneity of the environment. Existing data-driven methods mostly focus on studying homogeneous areas of limited size (e.g., a single urban area such as New York City) and fail to handle heterogeneous accident patterns over space at different scales. Recent advances (e.g., spatial ensembles) utilize pre-defined space partitions and learn multiple models to improve prediction accuracy. However, external knowledge is required to define proper space partitions before training models, and pre-defined partitions may not necessarily reduce the heterogeneity. To address this issue, we propose a novel Learning-Integrated Space Partition Framework (LISA) that learns partitions while training models, integrating the partitioning and learning processes so that partitioning is guided explicitly by prediction accuracy rather than other factors. Experiments using real-world datasets demonstrate that our work can capture underlying heterogeneous patterns in a self-guided way and substantially improves baseline networks by an average of 13.0%.
Submitted 19 December, 2024; originally announced December 2024.
Journal ref: IEEE International Conference on Data Mining, ICDM 2024

arXiv:2412.12510 (https://arxiv.org/abs/2412.12510) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.CY (Computers and Society)
Title: Can Large Language Models Understand You Better? An MBTI Personality Detection Dataset Aligned with Population Traits
Authors: Bohan Li, Jiannan Guan, Longxu Dou, Yunlong Feng, Dingzirui Wang, Yang Xu, Enbo Wang, Qiguang Chen, Bichen Wang, Xiao Xu, Yimeng Zhang, Libo Qin, Yanyan Zhao, Qingfu Zhu, Wanxiang Che
Abstract: The Myers-Briggs Type Indicator (MBTI) is one of the most influential personality theories reflecting individual differences in thinking, feeling, and behaving. MBTI personality detection has garnered considerable research interest and has evolved significantly over the years. However, this task tends to be overly optimistic, as it currently does not align well with the natural distribution of population personality traits. Specifically, (1) the self-reported labels in existing datasets result in incorrect labeling issues, and (2) the hard labels fail to capture the full range of population personality distributions. In this paper, we optimize the task by constructing MBTIBench, the first manually annotated, high-quality MBTI personality detection dataset with soft labels, under the guidance of psychologists. As for the first challenge, MBTIBench effectively resolves the incorrect labeling issues, which account for 29.58% of the data. As for the second challenge, we estimate soft labels by deriving the polarity tendency of samples. The obtained soft labels confirm that there are more people with non-extreme personality traits. Experimental results not only highlight the polarized predictions and biases in LLMs as key directions for future research, but also confirm that soft labels can provide more benefits to other psychological tasks than hard labels. The code and data are available at https://github.com/Personality-NLP/MbtiBench.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by COLING 2025. 28 pages, 20 figures, 10 tables

arXiv:2412.06847 (https://arxiv.org/abs/2412.06847) [pdf, other]
Subjects: q-bio.QM (Quantitative Methods); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: M$^{3}$-20M: A Large-Scale Multi-Modal Molecule Dataset for AI-driven Drug Design and Discovery
Authors: Siyuan Guo, Lexuan Wang, Chang Jin, Jinxian Wang, Han Peng, Huayang Shi, Wengen Li, Jihong Guan, Shuigeng Zhou
Abstract: This paper introduces M$^{3}$-20M, a large-scale Multi-Modal Molecule dataset that contains over 20 million molecules, with the data mainly integrated from existing databases and partially generated by large language models. Designed to support AI-driven drug design and discovery, M$^{3}$-20M contains 71 times more molecules than the largest existing dataset, providing an unprecedented scale that can highly benefit the training or fine-tuning of models, including large language models, for drug design and discovery tasks. The dataset integrates one-dimensional SMILES, two-dimensional molecular graphs, three-dimensional molecular structures, physicochemical properties, and textual descriptions collected through web crawling and generated using GPT-3.5, offering a comprehensive view of each molecule. To demonstrate the power of M$^{3}$-20M in drug design and discovery, we conduct extensive experiments on two key tasks, molecule generation and molecular property prediction, using large language models including GLM4, GPT-3.5, GPT-4, and Llama3-8b. Our experimental results show that M$^{3}$-20M can significantly boost model performance on both tasks. Specifically, it enables models to generate more diverse and valid molecular structures and achieve higher property prediction accuracy than existing single-modal datasets, which validates the value and potential of M$^{3}$-20M in supporting AI-driven drug design and discovery. The dataset is available at https://github.com/bz99bz/M-3.
Submitted 16 March, 2025; v1 submitted 7 December, 2024; originally announced December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18463">arXiv:2411.18463</a> <span> [<a href="https://arxiv.org/pdf/2411.18463">pdf</a>, <a href="https://arxiv.org/format/2411.18463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Hotspot-Driven Peptide Design via Multi-Fragment Autoregressive Extension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahan Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shitong Luo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+C">Chaoran Cheng</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiaqi Guan</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+R">Ruihan Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Ge Liu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jian Peng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jianzhu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18463v2-abstract-short" style="display: inline;"> Peptides, short chains of amino acids, interact with target proteins, making them a unique class of protein-based therapeutics for treating human diseases. Recently, deep generative models have shown great promise in peptide generation. However, several challenges remain in designing effective peptide binders. First, not all residues contribute equally to peptide-target interactions. Second, the g… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18463v2-abstract-full').style.display = 'inline'; document.getElementById('2411.18463v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18463v2-abstract-full" style="display: none;"> Peptides, short chains of amino acids, interact with target proteins, making them a unique class of protein-based therapeutics for treating human diseases. Recently, deep generative models have shown great promise in peptide generation. However, several challenges remain in designing effective peptide binders. First, not all residues contribute equally to peptide-target interactions. Second, the generated peptides must adopt valid geometries due to the constraints of peptide bonds. Third, realistic tasks for peptide drug development are still lacking. To address these challenges, we introduce PepHAR, a hot-spot-driven autoregressive generative model for designing peptides targeting specific proteins. Building on the observation that certain hot spot residues have higher interaction potentials, we first use an energy-based density model to fit and sample these key residues. Next, to ensure proper peptide geometry, we autoregressively extend peptide fragments by estimating dihedral angles between residue frames. 
Finally, we apply an optimization process to iteratively refine fragment assembly, ensuring correct peptide structures. By combining hot spot sampling with fragment-based extension, our approach enables de novo peptide design tailored to a target protein and allows the incorporation of key hot spot residues into peptide scaffolds. Extensive experiments, including peptide design and peptide scaffold generation, demonstrate the strong potential of PepHAR in computational peptide binder design. Source code will be available at https://github.com/Ced3-han/PepHAR.

Submitted 25 February, 2025; v1 submitted 26 November, 2024; originally announced November 2024.
Comments: Published as a conference paper at ICLR 2025
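
The two-stage recipe in the abstract (sample hot-spot residues from an energy-based density, then grow fragments by predicting inter-residue angles) can be sketched in a toy form. Everything below (the energy function, 2D geometry, Gaussian angle proposal, step length) is a simplifying assumption, not PepHAR's actual model.

```python
# Toy sketch of the two PepHAR stages: (1) sample a hot-spot residue from an
# energy-based density, (2) autoregressively extend a fragment by sampling a
# turn angle per step. All modeling choices here are placeholders.
import numpy as np

rng = np.random.default_rng(0)

def sample_hotspot(candidates, energy):
    """Sample one candidate position with probability proportional to exp(-E)."""
    w = np.exp(-energy(candidates))
    p = w / w.sum()
    return candidates[rng.choice(len(candidates), p=p)]

def extend_fragment(start, n_steps, bond_length=3.8):
    """Grow a C-alpha trace by sampling a dihedral-like turn angle per step."""
    pts = [np.asarray(start, dtype=float)]
    direction = np.array([1.0, 0.0])
    for _ in range(n_steps):
        angle = rng.normal(0.0, 0.4)          # predicted angle (placeholder)
        c, s = np.cos(angle), np.sin(angle)
        direction = np.array([c * direction[0] - s * direction[1],
                              s * direction[0] + c * direction[1]])
        pts.append(pts[-1] + bond_length * direction)
    return np.array(pts)

candidates = rng.uniform(-10, 10, size=(50, 2))       # candidate residue sites
energy = lambda x: np.linalg.norm(x, axis=1)          # toy energy: prefer origin
anchor = sample_hotspot(candidates, energy)
trace = extend_fragment(anchor, n_steps=8)
print(trace.round(2))
```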

arXiv:2411.10752 (https://arxiv.org/abs/2411.10752) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)

Towards a Comprehensive Benchmark for Pathological Lymph Node Metastasis in Breast Cancer Sections

Authors: Xitong Ling, Yuanyuan Lei, Jiawen Li, Junru Cheng, Wenting Huang, Tian Guan, Jian Guan, Yonghong He

Abstract: Advances in optical microscopy scanning have significantly contributed to computational pathology (CPath) by converting traditional histopathological slides into whole slide images (WSIs). This development enables comprehensive digital reviews by pathologists and accelerates AI-driven diagnostic support for WSI analysis. Recent advances in foundational pathology models have increased the need for benchmarking tasks. The Camelyon series is one of the most widely used open-source datasets in computational pathology. However, the quality, accessibility, and clinical relevance of the labels have not been comprehensively evaluated. In this study, we reprocessed 1,399 WSIs and labels from the Camelyon-16 and Camelyon-17 datasets, removing low-quality slides, correcting erroneous labels, and providing expert pixel annotations for tumor regions in the previously unreleased test set. Based on the sizes of re-annotated tumor regions, we upgraded the binary cancer screening task to a four-class task: negative, micro-metastasis, macro-metastasis, and Isolated Tumor Cells (ITC). We reevaluated pre-trained pathology feature extractors and multiple instance learning (MIL) methods using the cleaned dataset, providing a benchmark that advances AI development in histopathology.

Submitted 16 November, 2024; originally announced November 2024.
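
For intuition, the four-class labels can be read as a size-based decision rule. The cutoffs below are the standard AJCC definitions (ITC ≤ 0.2 mm, micro-metastasis ≤ 2.0 mm, macro-metastasis > 2.0 mm); whether the re-annotated dataset applies exactly these thresholds is an assumption.

```python
# Illustrative size-to-class rule using standard AJCC cutoffs; the dataset's
# exact criteria are not stated in the abstract, so this is an assumption.
from typing import Optional

def metastasis_class(largest_deposit_mm: Optional[float]) -> str:
    if largest_deposit_mm is None or largest_deposit_mm == 0.0:
        return "negative"
    if largest_deposit_mm <= 0.2:
        return "ITC"                 # isolated tumor cells
    if largest_deposit_mm <= 2.0:
        return "micro-metastasis"
    return "macro-metastasis"

for size in (None, 0.1, 1.5, 7.0):
    print(size, "->", metastasis_class(size))
```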

arXiv:2411.09231 (https://arxiv.org/abs/2411.09231) [pdf, other]
Subjects: cs.CR (Cryptography and Security)

AEAKA: An Adaptive and Efficient Authentication and Key Agreement Scheme for IoT in Cloud-Edge-Device Collaborative Environments

Authors: Kexian Liu, Jianfeng Guan, Xiaolong Hu, Jing Zhang, Jianli Liu, Hongke Zhang

Abstract: To meet users' diverse needs, cloud-edge-device collaboration has rapidly advanced and become standard practice. However, this complex environment, particularly in untrusted (non-collaborative) scenarios, presents numerous security challenges. Authentication acts as the first line of defense and is fundamental to addressing these issues. Although many authentication and key agreement schemes exist, they often face limitations, such as being tailored to overly specific scenarios where devices authenticate solely with either the edge or the cloud, or being unsuitable for resource-constrained devices. To address these challenges, we propose an adaptive and efficient authentication and key agreement scheme (AEAKA) for Cloud-Edge-Device IoT environments. This scheme is highly adaptive and scalable, capable of automatically and dynamically initiating different authentication methods based on device requirements. Additionally, it employs an edge-assisted authentication approach to reduce the load on third-party trust authorities. Furthermore, we introduce a hash-based algorithm for the authentication protocol, ensuring a lightweight method suitable for a wide range of resource-constrained devices while maintaining security. AEAKA ensures that entities use associated authentication credentials, enhancing the privacy of the authentication process.
Security proofs and performance analyses demonstrate that AEAKA outperforms other methods in terms of security and authentication efficiency.

Submitted 14 November, 2024; originally announced November 2024.
Comments: 17 pages, 14 figures; submitted to Transactions on Dependable and Secure Computing on 30 May 2024
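
The abstract does not specify the protocol, so as a generic illustration of the kind of lightweight, hash-based mutual authentication with key agreement it alludes to, here is a textbook HMAC challenge-response over a pre-shared key; this is not the AEAKA protocol itself.

```python
# Generic HMAC challenge-response with session-key derivation. A textbook
# construction shown for intuition only; NOT the AEAKA protocol.
import hmac, hashlib, os

def hmac_sha256(key: bytes, msg: bytes) -> bytes:
    return hmac.new(key, msg, hashlib.sha256).digest()

psk = os.urandom(32)                 # pre-shared key (provisioning assumed)

n_dev = os.urandom(16)               # device -> edge: fresh nonce
n_edge = os.urandom(16)              # edge -> device: its nonce + proof
edge_proof = hmac_sha256(psk, b"edge" + n_dev + n_edge)

# Device verifies the edge, then answers with its own proof.
assert hmac.compare_digest(edge_proof, hmac_sha256(psk, b"edge" + n_dev + n_edge))
dev_proof = hmac_sha256(psk, b"dev" + n_edge + n_dev)

# Edge verifies the device; both sides derive the same session key.
assert hmac.compare_digest(dev_proof, hmac_sha256(psk, b"dev" + n_edge + n_dev))
session_key = hmac_sha256(psk, b"session" + n_dev + n_edge)
print("mutually authenticated; session key:", session_key.hex()[:16], "...")
```

Only hash/HMAC primitives appear here, which is what makes such designs attractive for resource-constrained devices compared with public-key operations.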

arXiv:2411.09229 (https://arxiv.org/abs/2411.09229) [pdf, other]
Subjects: cs.CR (Cryptography and Security)

Efficient and Secure Cross-Domain Data-Sharing for Resource-Constrained Internet of Things

Authors: Kexian Liu, Jianfeng Guan, Xiaolong Hu, Jianli Liu, Hongke Zhang

Abstract: The growing complexity of Internet of Things (IoT) environments, particularly in cross-domain data sharing, presents significant security challenges. Existing data-sharing schemes often rely on computationally expensive cryptographic operations and centralized key management, limiting their effectiveness for resource-constrained devices. To address these issues, we propose an efficient, secure blockchain-based data-sharing scheme. First, our scheme adopts a distributed key generation method, which avoids a single point of failure. This method also allows independent pseudonym generation and key updates, enhancing authentication flexibility while reducing computational overhead. Additionally, the scheme provides a complete data-sharing process, covering data uploading, storage, and sharing, while ensuring data traceability, integrity, and privacy. Security analysis shows that the proposed scheme is theoretically secure and resistant to various attacks, while performance evaluations demonstrate lower computational and communication overhead compared to existing solutions, making it both secure and efficient for IoT applications.

Submitted 14 November, 2024; originally announced November 2024.
Comments: 15 pages, 10 figures; submitted to Transactions on Information Forensics & Security on 19 September 2024

arXiv:2411.03604 (https://arxiv.org/abs/2411.03604) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.NE (Neural and Evolutionary Computing)

Temporal-Difference Learning Using Distributed Error Signals

Authors: Jonas Guan, Shon Eduard Verch, Claas Voelcker, Ethan C. Jackson, Nicolas Papernot, William A. Cunningham

Abstract: A computational problem in biological reward-based learning is how credit assignment is performed in the nucleus accumbens (NAc). Much research suggests that NAc dopamine encodes temporal-difference (TD) errors for learning value predictions. However, dopamine is synchronously distributed in regionally homogeneous concentrations, which does not support explicit credit assignment (as used by backpropagation). It is unclear whether distributed errors alone are sufficient for synapses to make coordinated updates to learn complex, nonlinear reward-based learning tasks. We design a new deep Q-learning algorithm, Artificial Dopamine, to computationally demonstrate that synchronously distributed, per-layer TD errors may be sufficient to learn surprisingly complex RL tasks. We empirically evaluate our algorithm on MinAtar, the DeepMind Control Suite, and classic control tasks, and show it often achieves comparable performance to deep RL algorithms that use backpropagation.

Submitted 5 November, 2024; originally announced November 2024.
Comments: 10 pages, to be published at NeurIPS 2024
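
The core mechanism (every layer receives the same scalar TD error and makes a purely local update, with no backpropagation between layers) can be sketched as follows; the two-layer value network and Hebbian-style rule are simplifying assumptions, not the paper's exact architecture or update.

```python
# Sketch of per-layer updates driven by one synchronously distributed TD
# error, with no gradients flowing between layers. Simplified assumption,
# not the Artificial Dopamine architecture.
import numpy as np

rng = np.random.default_rng(0)
W1 = rng.normal(0, 0.1, (8, 4))    # layer 1: state -> hidden
w2 = rng.normal(0, 0.1, 8)         # layer 2: hidden -> value estimate
alpha, gamma = 0.05, 0.9

def forward(s):
    h = np.tanh(W1 @ s)
    return h, w2 @ h               # hidden activity, value estimate

s, s_next = rng.normal(size=4), rng.normal(size=4)
reward = 1.0
h, v = forward(s)
_, v_next = forward(s_next)
td_error = reward + gamma * v_next - v   # one scalar, broadcast to all layers

# Local, per-layer updates modulated by the SAME distributed error signal:
W1 += alpha * td_error * np.outer(h, s)  # layer 1: three-factor Hebbian rule
w2 += alpha * td_error * h               # layer 2: delta rule
print("TD error:", round(float(td_error), 3))
```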

arXiv:2411.02951 (https://arxiv.org/abs/2411.02951) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)

LDPM: Towards undersampled MRI reconstruction with MR-VAE and Latent Diffusion Prior

Authors: Xingjian Tang, Jingwei Guan, Linge Li, Ran Shi, Youmei Zhang, Mengye Lyu, Li Yan

Abstract: Diffusion models, as powerful generative models, have found a wide range of applications and shown great potential in solving image reconstruction problems. Some works attempted to solve MRI reconstruction with diffusion models, but these methods operate directly in pixel space, leading to higher computational costs for optimization and inference. Latent diffusion models, pre-trained on natural images with rich visual priors, are expected to solve the high computational cost problem in MRI reconstruction by operating in a lower-dimensional latent space. However, direct application to MRI reconstruction faces three key challenges: (1) absence of explicit control mechanisms for medical fidelity, (2) domain gap between natural images and MR physics, and (3) undefined data consistency in latent space. To address these challenges, a novel Latent Diffusion Prior-based undersampled MRI reconstruction (LDPM) method is proposed. Our LDPM framework addresses these challenges by: (1) a sketch-guided pipeline with a two-step reconstruction strategy, which balances perceptual quality and anatomical fidelity, (2) an MRI-optimized VAE (MR-VAE), which achieves an improvement of approximately 3.92 dB in PSNR for undersampled MRI reconstruction compared to that with SD-VAE, and (3) Dual-Stage Sampler, a modified version of the spaced DDPM sampler, which enforces high-fidelity reconstruction in the latent space. Experiments on the fastMRI dataset demonstrate the state-of-the-art performance of the proposed method and its robustness across various scenarios. The effectiveness of each module is also verified through ablation experiments.

Submitted 5 March, 2025; v1 submitted 5 November, 2024; originally announced November 2024.
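
For background, the standard data-consistency operation in undersampled MRI reconstruction (re-imposing the measured k-space samples on the current estimate) looks as follows. LDPM itself enforces consistency in latent space via its Dual-Stage Sampler, so this pixel/k-space version is only the baseline intuition, not the paper's mechanism.

```python
# Standard k-space data-consistency step: wherever k-space was measured,
# overwrite the estimate with the measurements. Baseline intuition only.
import numpy as np

def data_consistency(x_est, k_measured, mask):
    """x_est: image estimate; k_measured: measured k-space; mask: 1 where sampled."""
    k_est = np.fft.fft2(x_est)
    k_dc = np.where(mask == 1, k_measured, k_est)   # trust measurements where available
    return np.fft.ifft2(k_dc).real

rng = np.random.default_rng(0)
img = rng.normal(size=(64, 64))                      # stand-in ground truth
mask = (rng.uniform(size=(64, 64)) < 0.3).astype(int)  # ~30% sampling pattern
k_meas = np.fft.fft2(img) * mask
recon = data_consistency(rng.normal(size=(64, 64)), k_meas, mask)
print(recon.shape)
```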

arXiv:2410.20688 (https://arxiv.org/abs/2410.20688) [pdf, other]
Subjects: cs.LG (Machine Learning); q-bio.BM (Biomolecules)

Reprogramming Pretrained Target-Specific Diffusion Models for Dual-Target Drug Design

Authors: Xiangxin Zhou, Jiaqi Guan, Yijia Zhang, Xingang Peng, Liang Wang, Jianzhu Ma

Abstract: Dual-target therapeutic strategies have become a compelling approach and attracted significant attention due to various benefits, such as their potential in overcoming drug resistance in cancer therapy. Considering the tremendous success that deep generative models have achieved in structure-based drug design in recent years, we formulate dual-target drug design as a generative task and curate a novel dataset of potential target pairs based on synergistic drug combinations. We propose to design dual-target drugs with diffusion models that are trained on single-target protein-ligand complex pairs. Specifically, we align two pockets in 3D space with protein-ligand binding priors and build two complex graphs with shared ligand nodes for SE(3)-equivariant composed message passing, based on which we derive a composed drift in both 3D and categorical probability space in the generative process. Our algorithm can well transfer the knowledge gained in single-target pretraining to dual-target scenarios in a zero-shot manner. We also repurpose linker design methods as strong baselines for this task. Extensive experiments demonstrate the effectiveness of our method compared with various baselines.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20688v2-abstract-full').style.display = 'none'; document.getElementById('2410.20688v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10696">arXiv:2410.10696</a> <span> [<a href="https://arxiv.org/pdf/2410.10696">pdf</a>, <a href="https://arxiv.org/format/2410.10696">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> TALK-Act: Enhance Textural-Awareness for 2D Speaking Avatar Reenactment with Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiazhi Guan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Quanwei Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaisiyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengyi He</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiliang Xu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Haocheng Feng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Hongtao Xie</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Youjian Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10696v1-abstract-short" style="display: inline;"> Recently, 2D speaking avatars have increasingly participated in everyday scenarios due to the fast development of facial animation techniques. However, most existing works neglect the explicit control of human bodies. In this paper, we propose to drive not only the faces but also the torso and gesture movements of a speaking figure. Inspired by recent advances in diffusion models, we propose the M… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10696v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10696v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10696v1-abstract-full" style="display: none;"> Recently, 2D speaking avatars have increasingly participated in everyday scenarios due to the fast development of facial animation techniques. However, most existing works neglect the explicit control of human bodies. 

arXiv:2410.10696 (https://arxiv.org/abs/2410.10696) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)

TALK-Act: Enhance Textural-Awareness for 2D Speaking Avatar Reenactment with Diffusion Model

Authors: Jiazhi Guan, Quanwei Yang, Kaisiyuan Wang, Hang Zhou, Shengyi He, Zhiliang Xu, Haocheng Feng, Errui Ding, Jingdong Wang, Hongtao Xie, Youjian Zhao, Ziwei Liu

Abstract: Recently, 2D speaking avatars have increasingly participated in everyday scenarios due to the fast development of facial animation techniques. However, most existing works neglect the explicit control of human bodies. In this paper, we propose to drive not only the faces but also the torso and gesture movements of a speaking figure. Inspired by recent advances in diffusion models, we propose the Motion-Enhanced Textural-Aware ModeLing for SpeaKing Avatar Reenactment (TALK-Act) framework, which enables high-fidelity avatar reenactment from only short footage of monocular video. Our key idea is to enhance the textural awareness with explicit motion guidance in diffusion modeling. Specifically, we carefully construct 2D and 3D structural information as intermediate guidance. While recent diffusion models adopt a side network for control information injection, they fail to synthesize temporally stable results even with person-specific fine-tuning. We propose a Motion-Enhanced Textural Alignment module to enhance the bond between driving and target signals. Moreover, we build a Memory-based Hand-Recovering module to help with the difficulties in hand-shape preserving. After pre-training, our model can achieve high-fidelity 2D avatar reenactment with only 30 seconds of person-specific data. Extensive experiments demonstrate the effectiveness and superiority of our proposed framework. Resources can be found at https://guanjz20.github.io/projects/TALK-Act.

Submitted 14 October, 2024; originally announced October 2024.
Comments: Accepted to SIGGRAPH Asia 2024 (conference track). Project page: https://guanjz20.github.io/projects/TALK-Act

arXiv:2409.19700 (https://arxiv.org/abs/2409.19700) [pdf, other]
Subjects: cs.CL (Computation and Language)

2D-TPE: Two-Dimensional Positional Encoding Enhances Table Understanding for Large Language Models

Authors: Jia-Nan Li, Jian Guan, Wei Wu, Zhengtao Yu, Rui Yan

Abstract: Tables are ubiquitous across various domains for concisely representing structured information. Empowering large language models (LLMs) to reason over tabular data represents an actively explored direction. However, since typical LLMs only support one-dimensional (1D) inputs, existing methods often flatten the two-dimensional (2D) table structure into a sequence of tokens, which can severely disrupt the spatial relationships and result in an inevitable loss of vital contextual information. In this paper, we first empirically demonstrate the detrimental impact of such flattening operations on the performance of LLMs in capturing the spatial information of tables through two elaborate proxy tasks. Subsequently, we introduce a simple yet effective positional encoding method, termed "2D-TPE" (Two-Dimensional Table Positional Encoding), to address this challenge. 2D-TPE enables each attention head to dynamically select a permutation order of tokens within the context for attending to them, where each permutation represents a distinct traversal mode for the table, such as column-wise or row-wise traversal. 2D-TPE effectively mitigates the risk of losing essential spatial information while preserving computational efficiency, thus better preserving the table structure. Extensive experiments across five benchmarks demonstrate that 2D-TPE outperforms strong baselines, underscoring the importance of preserving the table structure for accurate table comprehension. Comprehensive analysis further reveals the substantially better scalability of 2D-TPE to large tables than baselines.

Submitted 18 October, 2024; v1 submitted 29 September, 2024; originally announced September 2024.
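
The key idea (each attention head may see token positions under a different traversal of the table, e.g., row-wise vs. column-wise, rather than one flattened order) can be illustrated by computing the two permutations and the per-head position ids; how these positions enter attention in the actual method is more involved than shown here.

```python
# Row-wise and column-wise traversal orders of a table's cell tokens, and the
# position id each order assigns to every cell. Illustrates the idea behind
# 2D-TPE; the attention-side machinery of the method is not reproduced.
import numpy as np

rows, cols = 3, 4
cells = np.arange(rows * cols).reshape(rows, cols)   # one token index per cell

row_wise = cells.reshape(-1)       # row-major traversal of the table
col_wise = cells.T.reshape(-1)     # column-major traversal of the table

def positions(perm):
    """positions[token] = rank of that token under the given traversal."""
    pos = np.empty_like(perm)
    pos[perm] = np.arange(len(perm))
    return pos

head_a_pos = positions(row_wise)   # this head sees row-wise distances
head_b_pos = positions(col_wise)   # this head sees column-wise distances
print(head_a_pos.reshape(rows, cols))
print(head_b_pos.reshape(rows, cols))
```

Under the column-wise order, vertically adjacent cells get consecutive positions, so a head using it can treat a column the way a flattened model treats a row.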

arXiv:2409.14837 (https://arxiv.org/abs/2409.14837) [pdf, other]
Subjects: cs.AR (Hardware Architecture)

MESC: Re-thinking Algorithmic Priority and/or Criticality Inversions for Heterogeneous MCSs

Authors: Jiapeng Guan, Ran Wei, Dean You, Yingquan Wang, Ruizhe Yang, Hui Wang, Zhe Jiang

Abstract: Modern Mixed-Criticality Systems (MCSs) rely on hardware heterogeneity to satisfy ever-increasing computational demands. However, most of the heterogeneous co-processors are designed to achieve high throughput, with their micro-architectures executing the workloads in a streaming manner. This streaming execution is often non-preemptive or limited-preemptive, preventing tasks' prioritisation based on their importance and resulting in frequent occurrences of algorithmic priority and/or criticality inversions. Such problems present a significant barrier to guaranteeing the systems' real-time predictability, especially when co-processors dominate the execution of the workloads (e.g., DNNs and transformers). In contrast to existing works that typically enable coarse-grained context switch by splitting the workloads/algorithms, we demonstrate a method that provides fine-grained context switch on a widely used open-source DNN accelerator by enabling instruction-level preemption without any workloads/algorithms modifications. As a systematic solution, we build a real system, i.e., Make Each Switch Count (MESC), from the SoC and ISA to the OS kernel. A theoretical model and analysis are also provided for timing guarantees.
Experimental results reveal that, compared to conventional MCSs using non-preemptive DNN accelerators, MESC achieved a 250x and 300x speedup in resolving algorithmic priority and criticality inversions, with less than 5% overhead. To our knowledge, this is the first work investigating algorithmic priority and criticality inversions for MCSs at the instruction level.

Submitted 23 September, 2024; originally announced September 2024.
Comments: Accepted at the 2024 IEEE Real-Time Systems Symposium (RTSS)
ACM Class: C.3; D.4.7

arXiv:2409.14483 (https://arxiv.org/abs/2409.14483) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)

One Model for Two Tasks: Cooperatively Recognizing and Recovering Low-Resolution Scene Text Images by Iterative Mutual Guidance

Authors: Minyi Zhao, Yang Wang, Jihong Guan, Shuigeng Zhou

Abstract: Scene text recognition (STR) from high-resolution (HR) images has been significantly successful; however, text reading on low-resolution (LR) images is still challenging due to insufficient visual information. Therefore, many scene text image super-resolution (STISR) models have recently been proposed to generate super-resolution (SR) images for the LR ones, after which STR is performed on the SR images, boosting recognition performance. Nevertheless, these methods have two major weaknesses. On the one hand, STISR approaches may generate imperfect or even erroneous SR images, which mislead the subsequent recognition of STR models. On the other hand, as the STISR and STR models are jointly optimized, the fidelity of SR images may be sacrificed in pursuit of high recognition accuracy. As a result, neither the recognition performance nor the fidelity of STISR models is desirable. Can we achieve both high recognition performance and good fidelity? To this end, in this paper we propose a novel method called IMAGE (for Iterative MutuAl GuidancE) to effectively recognize and recover LR scene text images simultaneously. Concretely, IMAGE consists of a specialized STR model for recognition and a tailored STISR model to recover LR images, which are optimized separately. And we develop an iterative mutual guidance mechanism, with which the STR model provides high-level semantic information as a clue to the STISR model for better super-resolution, while the STISR model offers essential low-level pixel clues to the STR model for more accurate recognition. Extensive experiments on two LR datasets demonstrate the superiority of our method over existing works in both recognition performance and super-resolution fidelity.

Submitted 22 September, 2024; originally announced September 2024.
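
The iterative mutual guidance loop can be sketched as a skeleton with placeholder models: recognition output guides super-resolution, and recovered pixels guide recognition, for a few rounds. The two functions below are stand-ins, not the paper's networks.

```python
# Skeleton of an iterative mutual-guidance loop in the spirit of IMAGE.
# Both models are placeholders returning dummy outputs.
def str_model(image, pixel_guidance=None):
    # returns (predicted_text, semantic_features); placeholder
    return "HELLO", {"semantic": image}

def stisr_model(lr_image, semantic_guidance=None):
    # returns (sr_image, pixel_features); placeholder
    return lr_image, {"pixels": lr_image}

def image_pipeline(lr_image, n_rounds=3):
    text, sem = str_model(lr_image)      # initial recognition on the LR input
    sr, pix = stisr_model(lr_image)      # initial recovery
    for _ in range(n_rounds):
        sr, pix = stisr_model(lr_image, semantic_guidance=sem)  # semantics -> SR
        text, sem = str_model(sr, pixel_guidance=pix)           # pixels -> STR
    return text, sr

text, sr = image_pipeline(lr_image=[[0.1, 0.2], [0.3, 0.4]])
print(text)
```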

arXiv:2409.12452 (https://arxiv.org/abs/2409.12452) [pdf, other]
Subjects: cs.CL (Computation and Language)

Unlocking Reasoning Potential in Large Language Models by Scaling Code-form Planning

Authors: Jiaxin Wen, Jian Guan, Hongning Wang, Wei Wu, Minlie Huang

Abstract: Despite the remarkable success of large language models (LLMs) on traditional natural language processing tasks, their planning ability remains a critical bottleneck in tackling complex multi-step reasoning tasks. Existing approaches mainly rely on prompting or task-specific fine-tuning, often suffering from poor robustness and cross-task generalization. To address this limitation, we introduce CodePlan, a scalable framework that empowers LLMs to generate and follow code-form plans: pseudocode that outlines high-level, structured reasoning processes. By leveraging the structured and versatile nature of code, CodePlan effectively captures the rich semantics and control flows inherent to sophisticated reasoning tasks. Importantly, CodePlan allows automatic extraction of code-form plans from massive, wide-ranging text corpora without the need for curated, task-specific datasets. This enables it to scale up efficiently and improve LLMs' reasoning capabilities across diverse scenarios. To train CodePlan, we construct a large-scale dataset of 2M examples that integrate code-form plans with standard prompt-response pairs from existing corpora. With minimal computation overhead during both training and inference, CodePlan achieves a 25.1% relative improvement compared with directly generating responses, averaged across 13 challenging multi-step reasoning benchmarks, spanning mathematical reasoning, symbolic reasoning, instruction-following, multi-hop QA, and decision-making tasks. Further analysis reveals CodePlan's increasing performance gains on more complex reasoning tasks, as well as significant data efficiency thanks to its generalization ability.

Submitted 4 October, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
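
To make "code-form plan" concrete, here is an invented example of pseudocode-style planning for a simple multi-step word problem; CodePlan extracts such plans automatically from corpora rather than hand-writing them, so this example is purely illustrative.

```python
# An invented code-form plan: the structure of the reasoning is laid out as
# code before the final answer is produced. Not taken from the paper's data.
def plan_tickets_problem():
    # Q: "Adult tickets cost $12, child tickets $5. A family buys 2 adult
    #     and 3 child tickets. How much change from $50?"
    adult_total = 2 * 12                # step 1: cost of adult tickets
    child_total = 3 * 5                 # step 2: cost of child tickets
    spent = adult_total + child_total   # step 3: combine the costs
    change = 50 - spent                 # step 4: subtract from the payment
    return change

print(plan_tickets_problem())           # -> 11
```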

arXiv:2409.07671 (https://arxiv.org/abs/2409.07671) [pdf, other]
Subjects: math.NA (Numerical Analysis); cs.LG (Machine Learning)

Transformed Physics-Informed Neural Networks for The Convection-Diffusion Equation

Authors: Jiajing Guan, Howard Elman

Abstract: Singularly perturbed problems are known to have solutions with steep boundary layers that are hard to resolve numerically. Traditional numerical methods, such as Finite Difference Methods (FDMs), require a refined mesh to obtain stable and accurate solutions. As Physics-Informed Neural Networks (PINNs) have been shown to successfully approximate solutions to differential equations from various fields, it is natural to examine their performance on singularly perturbed problems. The convection-diffusion equation is a representative example of such a class of problems, and we consider the use of PINNs to produce numerical solutions of this equation. We study two ways to use PINNs: as a method for correcting oscillatory discrete solutions obtained using FDMs, and as a method for modifying reduced solutions of unperturbed problems. For both methods, we also examine the use of input transformation to enhance accuracy, and we explain the behavior of input transformations analytically, with the help of neural tangent kernels.

Submitted 11 September, 2024; originally announced September 2024.
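
For reference, a standard 1D instance of the singularly perturbed convection-diffusion problem, with its exact solution and boundary layer, is the following (the specific problems studied in the paper may differ):

```latex
% A standard 1D model problem; the paper's exact setup is not assumed here.
\[
  -\epsilon\, u''(x) + u'(x) = 1, \quad x \in (0,1), \qquad u(0) = u(1) = 0,
\]
\[
  u(x) = x - \frac{e^{x/\epsilon} - 1}{e^{1/\epsilon} - 1}.
\]
```

For small $\epsilon$, the solution tracks the reduced solution $u(x) \approx x$ of the unperturbed problem $u' = 1$, $u(0) = 0$, then drops to zero inside a layer of width $O(\epsilon)$ at $x = 1$; this is exactly the regime where uniform meshes (and unmodified PINNs) struggle.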

arXiv:2409.03187 (https://arxiv.org/abs/2409.03187) [pdf, ps, other]
Subjects: cs.NE (Neural and Evolutionary Computing); cond-mat.dis-nn (Disordered Systems and Neural Networks); cs.LG (Machine Learning); math.DS (Dynamical Systems)

How noise affects memory in linear recurrent networks

Authors: JingChuan Guan, Tomoyuki Kubota, Yasuo Kuniyoshi, Kohei Nakajima

Abstract: The effects of noise on memory in a linear recurrent network are theoretically investigated. Memory is characterized by the network's ability to store previous inputs in its instantaneous state while it receives correlated or uncorrelated noise. Two major properties are revealed: First, the memory reduced by noise is uniquely determined by the noise's power spectral density (PSD). Second, the memory will not decrease regardless of noise intensity if the PSD belongs to a certain class of distributions (including power laws). The results are verified using human brain signals, showing good agreement.

Submitted 4 September, 2024; originally announced September 2024.
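
The quantity at stake (how well a linear readout of the current state can reconstruct the input from k steps back, with and without state noise) can be estimated numerically. The network below and the i.i.d. Gaussian noise (i.e., a flat PSD) are illustrative choices, not the paper's general setting.

```python
# Numerical estimate of the memory function of a linear recurrent network:
# R^2 of the best linear readout reconstructing u[t-k] from the state x[t].
# Illustrative setup only (i.i.d. Gaussian state noise = flat PSD).
import numpy as np

rng = np.random.default_rng(0)
N, T = 20, 20000
W = rng.normal(0, 1 / np.sqrt(N), (N, N)) * 0.9   # spectral radius ~ 0.9
v = rng.normal(0, 1, N)                           # input weights

def run(noise_std):
    u = rng.normal(size=T)
    x = np.zeros((T, N))
    for t in range(1, T):
        x[t] = W @ x[t - 1] + v * u[t] + noise_std * rng.normal(size=N)
    return u, x

def memory_function(u, x, k):
    X = x[k:]
    y = u[:-k]                                    # align x[t] with u[t-k]
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return 1 - np.mean((y - X @ w) ** 2) / np.var(y)

for noise in (0.0, 0.5):
    u, x = run(noise)
    print(noise, [round(memory_function(u, x, k), 2) for k in (1, 3, 5)])
```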

arXiv:2408.13461 (https://arxiv.org/abs/2408.13461) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)

Probing the Robustness of Vision-Language Pretrained Models: A Multimodal Adversarial Attack Approach

Authors: Jiwei Guan, Tianyu Ding, Longbing Cao, Lei Pan, Chen Wang, Xi Zheng

Abstract: Vision-language pretraining (VLP) with transformers has demonstrated exceptional performance across numerous multimodal tasks. However, the adversarial robustness of these models has not been thoroughly investigated. Existing multimodal attack methods have largely overlooked cross-modal interactions between visual and textual modalities, particularly in the context of cross-attention mechanisms. In this paper, we study the adversarial vulnerability of recent VLP transformers and design a novel Joint Multimodal Transformer Feature Attack (JMTFA) that concurrently introduces adversarial perturbations in both visual and textual modalities under white-box settings. JMTFA strategically targets attention relevance scores to disrupt important features within each modality, generating adversarial samples by fusing perturbations and leading to erroneous model predictions. Experimental results indicate that the proposed approach achieves high attack success rates on vision-language understanding and reasoning downstream tasks compared to existing baselines. Notably, our findings reveal that the textual modality significantly influences the complex fusion processes within VLP transformers. Moreover, we observe no apparent relationship between model size and adversarial robustness under our proposed attacks.
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09675">arXiv:2408.09675</a> <span> [<a href="https://arxiv.org/pdf/2408.09675">pdf</a>, <a href="https://arxiv.org/format/2408.09675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div>
<p class="title is-5 mathjax"> Multi-Agent Reinforcement Learning for Autonomous Driving: A Survey </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruiqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jing Hou</a>, <a href="/search/cs?searchtype=author&query=Walter%2C+F">Florian Walter</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+S">Shangding Gu</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiayi Guan</a>, <a href="/search/cs?searchtype=author&query=R%C3%B6hrbein%2C+F">Florian Röhrbein</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yali Du</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Panpan Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guang Chen</a>, <a href="/search/cs?searchtype=author&query=Knoll%2C+A">Alois Knoll</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Reinforcement Learning (RL) is a potent tool for sequential decision-making and has achieved performance surpassing human capabilities across many challenging real-world tasks. As the extension of RL to the multi-agent domain, multi-agent RL (MARL) must not only learn a control policy but also account for interactions with all other agents in the environment, mutual influences among different system components, and the distribution of computational resources. This augments the complexity of algorithmic design and raises the demands on computational resources. Simultaneously, simulators are crucial for obtaining realistic data, which is fundamental to RL. In this paper, we first propose a series of metrics for simulators and summarize the features of existing benchmarks. Second, to ease comprehension, we review the foundational knowledge and then synthesize recent advances in MARL-related autonomous driving and intelligent transportation systems. Specifically, we examine their environmental modeling, state representation, perception units, and algorithm design. Finally, we discuss open challenges as well as prospects and opportunities. We hope this paper can help researchers integrate MARL technologies and trigger more insightful ideas toward intelligent and autonomous driving. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 6 figures, and 2 tables. Submitted to an IEEE journal</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06402">arXiv:2408.06402</a> <span> [<a href="https://arxiv.org/pdf/2408.06402">pdf</a>, <a href="https://arxiv.org/format/2408.06402">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> PhaGO: Protein function annotation for bacteriophages by integrating the genomic context </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiaojiao Guan</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yongxin Ji</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Cheng Peng</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+W">Wei Zou</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xubo Tang</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+J">Jiayu Shang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanni Sun</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Bacteriophages are viruses that target bacteria, playing a crucial role in microbial ecology. Phage proteins are important in understanding phage biology, such as virus infection, replication, and evolution. Although a large number of new phages have been identified via metagenomic sequencing, many of them have limited protein function annotation. Accurate function annotation of phage proteins presents several challenges, including their inherent diversity and the scarcity of annotated ones. Existing tools have yet to fully leverage the unique properties of phages in annotating protein functions. In this work, we propose a new protein function annotation tool for phages that leverages the modular genomic structure of phage genomes. By employing embeddings from the latest protein foundation models and a Transformer to capture contextual information between proteins in phage genomes, PhaGO surpasses state-of-the-art methods in annotating diverged proteins and proteins with uncommon functions, with improvements of 6.78% and 13.05%, respectively. PhaGO can annotate proteins lacking homology search results, which is critical for characterizing the rapidly accumulating phage genomes. We demonstrate the utility of PhaGO by identifying 688 potential holins in phages, which exhibit high structural conservation with known holins. The results show the potential of PhaGO to extend our understanding of newly discovered phages. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05914">arXiv:2408.05914</a> <span> [<a href="https://arxiv.org/pdf/2408.05914">pdf</a>, <a href="https://arxiv.org/format/2408.05914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Deep Multimodal Collaborative Learning for Polyp Re-Identification </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+S">Suncheng Xiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jincheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+S">Shilun Cai</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiale Guan</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+D">Dahong Qian</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Colonoscopic Polyp Re-Identification aims to match the same polyp from a large gallery with images from different views taken using different cameras, which plays an important role in the prevention and treatment of colorectal cancer in computer-aided diagnosis. However, traditional object ReID methods that directly adopt CNN models trained on the ImageNet dataset usually produce unsatisfactory retrieval performance on colonoscopic datasets due to the large domain gap. Worse still, these solutions typically learn unimodal representations from visual samples alone, failing to explore complementary information from other modalities. To address this challenge, we propose a novel Deep Multimodal Collaborative Learning framework named DMCL for polyp re-identification, which can effectively encourage modality collaboration and reinforce generalization capability in medical scenarios. On top of this, a dynamic multimodal feature fusion strategy is introduced to leverage the optimized multimodal representations for multimodal fusion via end-to-end training. Experiments on standard benchmarks show the benefits of the multimodal setting over state-of-the-art unimodal ReID models, especially when combined with the specialized multimodal fusion strategy, demonstrating that learning representations from multiple modalities can be competitive with methods based on unimodal representation learning. We also hope that our method will shed light on related research going forward, especially on multimodal collaborative learning. The code is publicly available at https://github.com/JeremyXSC/DMCL. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03284">arXiv:2408.03284</a> <span> [<a href="https://arxiv.org/pdf/2408.03284">pdf</a>, <a href="https://arxiv.org/format/2408.03284">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div>
<p class="title is-5 mathjax"> ReSyncer: Rewiring Style-based Generator for Unified Audio-Visually Synced Facial Performer </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiazhi Guan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhiliang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaisiyuan Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengyi He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhanwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Borong Liang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+H">Haocheng Feng</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingtuo Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Youjian Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Lip-syncing videos with given audio is the foundation for various applications, including the creation of virtual presenters or performers. While recent studies explore high-fidelity lip-sync with different techniques, their task-oriented models either require long-term videos for clip-specific training or retain visible artifacts. In this paper, we propose a unified and effective framework, ReSyncer, that synchronizes generalized audio-visual facial information. The key design is revisiting and rewiring the Style-based generator to efficiently adopt 3D facial dynamics predicted by a principled style-injected Transformer. By simply re-configuring the information insertion mechanisms within the noise and style space, our framework fuses motion and appearance with unified training. Extensive experiments demonstrate that ReSyncer not only produces high-fidelity lip-synced videos according to audio, but also supports multiple appealing properties that are suitable for creating virtual presenters and performers, including fast personalized fine-tuning, video-driven lip-syncing, the transfer of speaking styles, and even face swapping. Resources can be found at https://guanjz20.github.io/projects/ReSyncer. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to European Conference on Computer Vision (ECCV), 2024. Project page: https://guanjz20.github.io/projects/ReSyncer</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06677">arXiv:2407.06677</a> <span> [<a href="https://arxiv.org/pdf/2407.06677">pdf</a>, <a href="https://arxiv.org/format/2407.06677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> Mixture-of-Modules: Reinventing Transformers as Dynamic Assemblies of Modules </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zhuocheng Gong</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+A">Ang Lv</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junxi Yan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huishuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minlie Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dongyan Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Is it always necessary to compute tokens from shallow to deep layers in Transformers? The continued success of vanilla Transformers and their variants suggests an undoubted "yes". In this work, however, we attempt to break the depth-ordered convention by proposing a novel architecture dubbed mixture-of-modules (MoM), motivated by the intuition that any layer, regardless of its position, can be used to compute a token as long as it possesses the needed processing capabilities. The construction of MoM starts from a finite set of modules defined by multi-head attention and feed-forward networks, each distinguished by its unique parameterization. Two routers then iteratively select attention modules and feed-forward modules from the set to process a token. The selection dynamically expands the computation graph in the forward pass of the token, culminating in an assembly of modules. We show that MoM provides not only a unified framework for Transformers and their numerous variants but also a flexible and learnable approach for reducing redundancy in Transformer parameterization. We pre-train various MoMs using OpenWebText. Empirical results demonstrate that MoMs of different parameter counts consistently outperform vanilla Transformers on both the GLUE and XSUM benchmarks. More interestingly, with a fixed parameter budget, MoM-large enables an over 38% increase in depth for computation graphs compared to GPT-2-large, resulting in absolute gains of 1.4 on GLUE and 1.0 on XSUM. On the other hand, MoM-large also enables an over 60% reduction in depth while involving more modules per layer, yielding a 16% reduction in TFLOPs and a 43% decrease in memory usage compared to GPT-2-large, while maintaining comparable performance. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
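<p>A miniature of the routing idea, with hypothetical pool sizes and a deliberately crude hard top-1 router; the paper's routers and assembly procedure may differ.</p>
<pre><code>
import torch
import torch.nn as nn

# Shared pools of attention and feed-forward modules; two routers assemble
# a computation graph by repeatedly picking one module from each pool.
d, n_mod, depth = 64, 4, 6
attn_pool = nn.ModuleList(
    nn.MultiheadAttention(d, 4, batch_first=True) for _ in range(n_mod))
ffn_pool = nn.ModuleList(
    nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))
    for _ in range(n_mod))
route_a, route_f = nn.Linear(d, n_mod), nn.Linear(d, n_mod)

def mom_forward(x):                        # x: (batch, seq, d)
    for _ in range(depth):
        summary = x.mean(dim=(0, 1))       # crude global routing signal
        a = attn_pool[route_a(summary).argmax().item()]
        f = ffn_pool[route_f(summary).argmax().item()]
        x = x + a(x, x, x, need_weights=False)[0]
        x = x + f(x)
    return x

print(mom_forward(torch.randn(2, 10, d)).shape)   # torch.Size([2, 10, 64])
</code></pre>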
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04936">arXiv:2407.04936</a> <span> [<a href="https://arxiv.org/pdf/2407.04936">pdf</a>, <a href="https://arxiv.org/format/2407.04936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div>
<p class="title is-5 mathjax"> A Reference-free Metric for Language-Queried Audio Source Separation using Contrastive Language-Audio Pretraining </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+F">Feiyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qiaoxi Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xubo Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenbo Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+S">Shuhan Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kejia Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianyuan Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenwu Wang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Language-queried audio source separation (LASS) aims to separate an audio source guided by a text query, with signal-to-distortion ratio (SDR)-based metrics commonly used to objectively measure the quality of the separated audio. However, SDR-based metrics require a reference signal, which is often difficult to obtain in real-world scenarios. In addition, with SDR-based metrics, the content information of the text query is not considered effectively in LASS. This paper introduces a reference-free evaluation metric using a contrastive language-audio pretraining (CLAP) module, termed CLAPScore, which measures the semantic similarity between the separated audio and the text query. Unlike SDR, the proposed CLAPScore metric evaluates the quality of the separated audio based on the content information of the text query, without needing a reference signal. Experiments show that CLAPScore provides an effective evaluation of the semantic relevance of the separated audio to the text query, as compared to the SDR metric, offering an alternative for the performance evaluation of LASS systems. The code for evaluation is publicly available. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by DCASE 2024 Workshop. GitHub: https://github.com/LittleFlyingSheep/CLAPScore_for_LASS</span> </p>
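<p>The metric reduces to a cosine similarity in the shared CLAP embedding space. A minimal sketch, assuming the open-source laion_clap package and its default checkpoint (the authors' exact implementation lives in their repository):</p>
<pre><code>
import numpy as np
import laion_clap   # assumed dependency: the open-source LAION-CLAP package

# CLAPScore-style, reference-free evaluation: embed the separated audio and
# the text query with the same CLAP model, then take cosine similarity.
model = laion_clap.CLAP_Module(enable_fusion=False)
model.load_ckpt()   # downloads the default pretrained checkpoint

audio_emb = model.get_audio_embedding_from_filelist(x=["separated.wav"])
text_emb = model.get_text_embedding(["the sound of a dog barking"])

a, t = audio_emb[0], text_emb[0]
clap_score = float(np.dot(a, t) / (np.linalg.norm(a) * np.linalg.norm(t)))
print(clap_score)   # higher = separated audio better matches the query
</code></pre>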
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.19934">arXiv:2406.19934</a> <span> [<a href="https://arxiv.org/pdf/2406.19934">pdf</a>, <a href="https://arxiv.org/format/2406.19934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> From the Least to the Most: Building a Plug-and-Play Visual Reasoner via Data Synthesis </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+C">Chuanqi Cheng</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: We explore multi-step reasoning in vision-language models (VLMs). The problem is challenging, as reasoning data consisting of multiple steps of visual and language processing are barely available. To overcome the challenge, we first introduce a least-to-most visual reasoning paradigm, which interleaves steps of decomposing a question into sub-questions and invoking external tools to resolve the sub-questions. Based on the paradigm, we further propose a novel data synthesis approach that can automatically create questions and multi-step reasoning paths for an image in a bottom-up manner. Our approach divides the complex synthesis task into a few simple sub-tasks and relies (almost entirely) on open-sourced models to accomplish them. The entire synthesis process is therefore reproducible and cost-efficient, and the synthesized data is of guaranteed quality. With this approach, we construct $50$k visual reasoning examples. We then develop a visual reasoner through supervised fine-tuning, which is capable of generally enhancing the reasoning abilities of a wide range of existing VLMs in a plug-and-play fashion. Extensive experiments indicate that the visual reasoner can consistently and significantly improve four VLMs on four VQA benchmarks. Our code and dataset are available at https://github.com/steven-ccq/VisualReasoner. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024</span> </p>
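<p>A skeletal version of the least-to-most loop, with placeholder helpers standing in for the VLM and the external tools; every name below is hypothetical.</p>
<pre><code>
# Least-to-most reasoning loop (illustration only): decompose the question
# into sub-questions, resolve each with an external tool, then compose.
def decompose(question, context):
    # Placeholder: a real system would ask a language model for sub-steps.
    return ["what objects are present?", "how many of them are red?"]

def call_tool(sub_question, image):
    # Placeholder: a real system would invoke grounding/OCR/captioning.
    return f"[tool answer to: {sub_question}]"

def compose(question, context):
    # Placeholder: a real system would aggregate solved steps into an answer.
    return " ; ".join(f"{q} -> {a}" for q, a in context.items())

def least_to_most(question, image=None):
    context = {}
    for sub_q in decompose(question, context):    # decomposition step
        context[sub_q] = call_tool(sub_q, image)  # tool-invocation step
    return compose(question, context)

print(least_to_most("Are there more red objects than blue ones?"))
</code></pre>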
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.16708">arXiv:2406.16708</a> <span> [<a href="https://arxiv.org/pdf/2406.16708">pdf</a>, <a href="https://arxiv.org/format/2406.16708">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> </div> </div>
<p class="title is-5 mathjax"> CausalFormer: An Interpretable Transformer for Temporal Causal Discovery </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingbai Kong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wengen Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hanchen Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichao Zhang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jihong Guan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shuigeng Zhou</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Temporal causal discovery is a crucial task aimed at uncovering the causal relations within time series data. The latest temporal causal discovery methods usually train deep learning models on prediction tasks to uncover the causality between time series. They capture causal relations by analyzing the parameters of some components of the trained models, e.g., attention weights and convolution weights. However, this is an incomplete mapping process from the model parameters to the causality, and it fails to investigate the other components, e.g., fully connected layers and activation functions, which are also significant for causal discovery. To facilitate the utilization of whole deep learning models in temporal causal discovery, we propose an interpretable transformer-based causal discovery model termed CausalFormer, which consists of a causality-aware transformer and a decomposition-based causality detector. The causality-aware transformer learns the causal representation of time series data using a prediction task with the designed multi-kernel causal convolution, which aggregates each input time series along the temporal dimension under the temporal priority constraint. Then, the decomposition-based causality detector interprets the global structure of the trained causality-aware transformer with the proposed regression relevance propagation to identify potential causal relations and finally construct the causal graph. Experiments on synthetic, simulated, and real datasets demonstrate the state-of-the-art performance of CausalFormer in discovering temporal causality. Our code is available at https://github.com/lingbai-kong/CausalFormer. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18729">arXiv:2405.18729</a> <span> [<a href="https://arxiv.org/pdf/2405.18729">pdf</a>, <a href="https://arxiv.org/format/2405.18729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Preferred-Action-Optimized Diffusion Policies for Offline Reinforcement Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianle Zhang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiayi Guan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lin Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yihang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongjiang Li</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zecui Zeng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lei Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yue Chen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xuelong Wei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lusong Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaodong He</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Offline reinforcement learning (RL) aims to learn optimal policies from previously collected datasets. Recently, due to their powerful representational capabilities, diffusion models have shown significant potential as policy models for offline RL problems. However, previous offline RL algorithms based on diffusion policies generally adopt weighted regression to improve the policy. This approach optimizes the policy using only the collected actions and is sensitive to Q-values, which limits the potential for further performance enhancement. To this end, we propose a novel preferred-action-optimized diffusion policy for offline RL. In particular, an expressive conditional diffusion model is utilized to represent the diverse distribution of a behavior policy. Meanwhile, based on the diffusion model, preferred actions within the same behavior distribution are automatically generated through the critic function. Moreover, an anti-noise preference optimization is designed to achieve policy improvement by using the preferred actions, which can adapt to noisy preferred actions for stable training. Extensive experiments demonstrate that the proposed method provides competitive or superior performance compared to previous state-of-the-art offline RL methods, particularly in sparse-reward tasks such as Kitchen and AntMaze. Additionally, we empirically demonstrate the effectiveness of anti-noise preference optimization. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15769">arXiv:2405.15769</a> <span> [<a href="https://arxiv.org/pdf/2405.15769">pdf</a>, <a href="https://arxiv.org/format/2405.15769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> FastDrag: Manipulate Anything in One Step </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xuanjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jian Guan</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+C">Congyi Fan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dongli Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Youtian Lin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+H">Haiwei Pan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+P">Pengming Feng</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Drag-based image editing using generative models provides precise control over image contents, enabling users to manipulate anything in an image with a few clicks. However, prevailing methods typically adopt $n$-step iterations for latent semantic optimization to achieve drag-based image editing, which is time-consuming and limits practical applications. In this paper, we introduce a novel one-step drag-based image editing method, i.e., FastDrag, to accelerate the editing process. Central to our approach is a latent warpage function (LWF), which simulates the behavior of a stretched material to adjust the locations of individual pixels within the latent space. This innovation achieves one-step latent semantic optimization and hence significantly promotes editing speed. Meanwhile, null regions emerging after applying the LWF are addressed by our proposed bilateral nearest neighbor interpolation (BNNI) strategy, which interpolates these regions using similar features from neighboring areas, thus enhancing semantic integrity. Additionally, a consistency-preserving strategy is introduced to maintain consistency between the edited and original images: semantic information from the original image, saved as key and value pairs in the self-attention module during diffusion inversion, is adopted to guide the diffusion sampling. FastDrag is validated on the DragBench dataset, demonstrating substantial improvements in processing time over existing methods while achieving enhanced editing performance. Project page: https://fastdrag-site.github.io/ . </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024. Project page: https://fastdrag-site.github.io/</span> </p>
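<p>A loose two-dimensional intuition for the warp-then-fill step: after relocating cells, null regions are filled from nearby populated cells. This toy uses plain nearest-neighbor filling on a raw grid, whereas BNNI is bilateral and FastDrag operates on diffusion latents.</p>
<pre><code>
import numpy as np

# Warp a grid (crude stand-in for the latent warpage function), leaving
# null regions, then fill each null cell from its nearest populated cell.
rng = np.random.default_rng(0)
lat = rng.normal(size=(8, 8))
warped = np.full_like(lat, np.nan)

shift = 2
warped[shift:, :] = lat[:-shift, :]        # rows 0..1 become null regions

ys, xs = np.nonzero(~np.isnan(warped))     # populated cells
for i, j in zip(*np.nonzero(np.isnan(warped))):
    k = np.argmin((ys - i) ** 2 + (xs - j) ** 2)
    warped[i, j] = warped[ys[k], xs[k]]    # copy nearest populated value
print(np.isnan(warped).any())              # False: null regions filled
</code></pre>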
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13528">arXiv:2404.13528</a> <span> [<a href="https://arxiv.org/pdf/2404.13528">pdf</a>, <a href="https://arxiv.org/format/2404.13528">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1145/3620666.3651384">10.1145/3620666.3651384</a></span> </div> </div> </div>
<p class="title is-5 mathjax"> SmartMem: Layout Transformation Elimination and Adaptation for Efficient DNN Execution on Mobile </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+W">Wei Niu</a>, <a href="/search/cs?searchtype=author&query=Sanim%2C+M+M+R">Md Musfiqur Rahman Sanim</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+Z">Zhihao Shu</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+J">Jiexiong Guan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xipeng Shen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Miao Yin</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+G">Gagan Agrawal</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+B">Bin Ren</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: This work is motivated by recent developments in Deep Neural Networks, particularly the Transformer architectures underlying applications such as ChatGPT, and the need for performing inference on mobile devices. Focusing on emerging transformers (specifically the ones with computationally efficient Swin-like architectures) and large models (e.g., Stable Diffusion and LLMs) based on transformers, we observe that layout transformations between the computational operators cause a significant slowdown in these applications. This paper presents SmartMem, a comprehensive framework for eliminating most layout transformations, based on the idea that multiple operators can use the same tensor layout through careful choice of layout and implementation of operations. Our approach classifies the operators into four groups and considers combinations of producer-consumer edges between the operators, and we develop a set of methods for searching such layouts. Another component of our work is developing efficient memory layouts for the 2.5-dimensional memory commonly seen in mobile devices. Our experimental results show that SmartMem outperforms 5 state-of-the-art DNN execution frameworks on mobile devices across 18 varied neural networks, including CNNs, Transformers with both local and global attention, as well as LLMs. In particular, compared to DNNFusion, SmartMem achieves an average speedup of 2.8$\times$, and outperforms TVM and MNN with average speedups of 6.9$\times$ and 7.9$\times$, respectively. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p>
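<p>The overhead SmartMem eliminates is easy to surface even off-device. The sketch below times an explicit NCHW-to-NHWC relayout of the kind that would sit between two operators with mismatched layout preferences; tensor sizes and iteration count are arbitrary.</p>
<pre><code>
import time
import numpy as np

# Materialize an NCHW -> NHWC layout transformation between two operators.
x = np.random.rand(8, 64, 128, 128).astype(np.float32)   # NCHW tensor

t0 = time.perf_counter()
for _ in range(50):
    y = np.ascontiguousarray(x.transpose(0, 2, 3, 1))    # copy into NHWC
t1 = time.perf_counter()
print(f"relayout cost: {(t1 - t0) / 50 * 1e3:.2f} ms per operator pair")
# SmartMem's idea: pick one layout both operators can consume so this copy
# disappears from the graph entirely.
</code></pre>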
</li> </ol> </div> </main> </body> </html>