Search | arXiv e-print repository

Showing 1–50 of 2,218 results for author: Wang, Q

Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Wang%2C+Q

1. arXiv:2502.14739 [pdf, other] cs.CL
   SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines
   Authors: M-A-P Team, Xinrun Du, Yifan Yao, Kaijing Ma, Bingli Wang, Tianyu Zheng, Kang Zhu, Minghao Liu, Yiming Liang, Xiaolong Jin, Zhenlin Wei, Chujie Zheng, Kaixing Deng, Shuyue Guo, Shian Jia, Sichao Jiang, Yiyan Liao, Rui Li, Qinrui Li, Sirun Li, Yizhi Li, Yunwen Li, Dehua Ma, Yuansheng Ni, Haoran Que, et al. (70 additional authors not shown)
   Abstract: Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields, particularly in light industry, agriculture, and service-oriented disciplines, remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 disciplines. Our benchmark employs a novel Human-LLM collaborative filtering mechanism to eliminate trivial or ambiguous questions through iterative refinement based on both LLM responses and expert feedback. Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence. Additionally, we present comprehensive insights from our management of a large-scale annotation process, involving over 80 expert annotators and an interactive Human-LLM collaborative system, offering valuable methodological guidance for future research initiatives of comparable scope.
   Submitted 20 February, 2025; originally announced February 2025.

2. arXiv:2502.13995 [pdf, other] cs.GR
   FantasyID: Face Knowledge Enhanced ID-Preserving Video Generation
   Authors: Yunpeng Zhang, Qiang Wang, Fan Jiang, Yaqi Fan, Mu Xu, Yonggang Qi
   Abstract: Tuning-free approaches adapting large-scale pre-trained video diffusion models for identity-preserving text-to-video generation (IPT2V) have gained popularity recently due to their efficacy and scalability. However, significant challenges remain in achieving satisfactory facial dynamics while keeping the identity unchanged. In this work, we present a novel tuning-free IPT2V framework, dubbed FantasyID, that enhances the face knowledge of a pre-trained video model built on diffusion transformers (DiT). Essentially, a 3D facial geometry prior is incorporated to ensure plausible facial structures during video synthesis. To prevent the model from learning copy-paste shortcuts that simply replicate the reference face across frames, a multi-view face augmentation strategy is devised to capture diverse 2D facial appearance features, hence increasing the dynamics of facial expressions and head poses. Additionally, after blending the 2D and 3D features as guidance, instead of naively employing cross-attention to inject guidance cues into the DiT layers, a learnable layer-aware adaptive mechanism is employed to selectively inject the fused features into each individual DiT layer, facilitating balanced modeling of identity preservation and motion dynamics. Experimental results validate our model's superiority over the current tuning-free IPT2V methods.
   Submitted 19 February, 2025; originally announced February 2025.

3. arXiv:2502.13859 [pdf, other] cs.CV
   MSVCOD: A Large-Scale Multi-Scene Dataset for Video Camouflage Object Detection
   Authors: Shuyong Gao, Yu'ang Feng, Qishan Wang, Lingyi Hong, Xinyu Zhou, Liu Fei, Yan Wang, Wenqiang Zhang
   Abstract: Video Camouflaged Object Detection (VCOD) is a challenging task which aims to identify objects that are seamlessly concealed within the background in videos. The dynamic properties of video enable detection of camouflaged objects through motion cues or varied perspectives. Previous VCOD datasets primarily contain animal objects, limiting the scope of research to wildlife scenarios. However, the applications of VCOD extend beyond wildlife and have significant implications in security, art, and medical fields. Addressing this problem, we construct a new large-scale multi-domain VCOD dataset, MSVCOD. To achieve high-quality annotations, we design a semi-automatic iterative annotation pipeline that reduces costs while maintaining annotation accuracy. Our MSVCOD is the largest VCOD dataset to date, introducing multiple object categories including human, animal, medical, and vehicle objects for the first time, while also expanding background diversity across various environments. This expanded scope increases the practical applicability of the VCOD task in camouflaged object detection. Alongside this dataset, we introduce a one-stream video camouflage object detection model that performs both feature extraction and information fusion without additional motion feature fusion modules. Our framework achieves state-of-the-art results on the existing VCOD animal dataset and the proposed MSVCOD. The dataset and code will be made publicly available.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: 10 pages

4. arXiv:2502.13427 [pdf, ps, other] quant-ph cs.CC
   Does there exist a quantum fingerprinting protocol without coherent measurements?
   Authors: Atsuya Hasegawa, Srijita Kundu, François Le Gall, Harumichi Nishimura, Qisheng Wang
   Abstract: Buhrman, Cleve, Watrous, and de Wolf (PRL 2001) discovered the quantum fingerprinting protocol, a quantum SMP protocol with $O(\log n)$ qubits of communication for the equality problem. In the protocol, Alice and Bob create quantum fingerprints of their inputs, and the referee conducts SWAP tests on the quantum fingerprints. Since $\Omega(\sqrt{n})$ bits of communication are required in the classical SMP scheme for the equality problem, as first shown by Newman and Szegedy (STOC 1996), there exists an exponential quantum advantage in the amount of communication. In this paper, we consider a setting in which the referee can perform only incoherent measurements rather than coherent measurements such as the SWAP tests. We first show that, in the case of one-way LOCC measurements, $\Omega(\sqrt{n})$ qubits of communication are required. To prove the result, we derive a new method to replace quantum messages by classical messages and consider a reduction to the optimal lower bound in the hybrid SMP model, where one message is quantum and the other is classical, first shown by Klauck and Podder (MFCS 2014). Our method uses the result of Oszmaniec, Guerini, Wittek, and Acín (PRL 2017), who showed that general POVM measurements can be simulated by randomized projective measurements with small ancilla qubits, together with Newman's theorem. We further investigate the setting of quantum SMP protocols with two-way LOCC measurements, and derive a lower bound against some restricted two-way LOCC measurements. To prove it, we revisit the technique to replace quantum messages by classical deterministic messages introduced by Aaronson (ToC 2005) and generalized by Gavinsky, Regev, and de Wolf (CJTCS 2008), and show that, using the deterministic message, the referee can simulate the two-way LOCC measurements.
   Submitted 18 February, 2025; originally announced February 2025.
   Comments: 33 pages
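
   For context on the referee's step in the protocol above: one SWAP test on fingerprint states |ψ⟩ and |φ⟩ accepts (the ancilla measures 0) with probability (1 + |⟨ψ|φ⟩|²)/2, so equal fingerprints always pass while near-orthogonal ones fail repeated tests with high probability. The sketch below simulates only these acceptance statistics, not the circuit itself; it is an illustration, not code from the paper.

```python
import numpy as np

def swap_test_accept_prob(psi: np.ndarray, phi: np.ndarray) -> float:
    """Acceptance probability of one SWAP test on normalized states:
    P(ancilla = 0) = (1 + |<psi|phi>|^2) / 2."""
    overlap = np.vdot(psi, phi)            # <psi|phi>, conjugating psi
    return 0.5 * (1.0 + abs(overlap) ** 2)

def referee_accepts(psi, phi, repetitions=20, seed=0):
    """The referee declares 'equal' only if all repeated SWAP tests pass."""
    rng = np.random.default_rng(seed)
    p = swap_test_accept_prob(psi, phi)
    return all(rng.random() < p for _ in range(repetitions))

psi = np.array([1.0, 0.0])         # toy one-qubit "fingerprints"
phi = np.array([0.0, 1.0])         # orthogonal to psi
print(referee_accepts(psi, psi))   # True: equal inputs pass every test
print(referee_accepts(psi, phi))   # False w.h.p.: each test passes with p = 0.5
```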

5. arXiv:2502.12893 [pdf, other] cs.CL
   H-CoT: Hijacking the Chain-of-Thought Safety Reasoning Mechanism to Jailbreak Large Reasoning Models, Including OpenAI o1/o3, DeepSeek-R1, and Gemini 2.0 Flash Thinking
   Authors: Martin Kuo, Jianyi Zhang, Aolin Ding, Qinsi Wang, Louis DiValentin, Yujia Bao, Wei Wei, Da-Cheng Juan, Hai Li, Yiran Chen
   Abstract: Large Reasoning Models (LRMs) have recently extended their powerful reasoning capabilities to safety checks, using chain-of-thought reasoning to decide whether a request should be answered. While this new approach offers a promising route for balancing model utility and safety, its robustness remains underexplored. To address this gap, we introduce Malicious-Educator, a benchmark that disguises extremely dangerous or malicious requests beneath seemingly legitimate educational prompts. Our experiments reveal severe security flaws in popular commercial-grade LRMs, including OpenAI o1/o3, DeepSeek-R1, and Gemini 2.0 Flash Thinking. For instance, although OpenAI's o1 model initially maintains a high refusal rate of about 98%, subsequent model updates significantly compromise its safety; and attackers can easily extract criminal strategies from DeepSeek-R1 and Gemini 2.0 Flash Thinking without any additional tricks. To further highlight these vulnerabilities, we propose Hijacking Chain-of-Thought (H-CoT), a universal and transferable attack method that leverages the model's own displayed intermediate reasoning to jailbreak its safety reasoning mechanism. Under H-CoT, refusal rates sharply decline, dropping from 98% to below 2%, and, in some instances, initially cautious tones even transform into ones that are willing to provide harmful content. We hope these findings underscore the urgent need for more robust safety mechanisms to preserve the benefits of advanced reasoning capabilities without compromising ethical standards.
   Submitted 18 February, 2025; originally announced February 2025.

6. arXiv:2502.12330 [pdf, other] cs.RO cs.LG
   X-IL: Exploring the Design Space of Imitation Learning Policies
   Authors: Xiaogang Jia, Atalay Donat, Xi Huang, Xuan Zhao, Denis Blessing, Hongyi Zhou, Han A. Wang, Hanyi Zhang, Qian Wang, Rudolf Lioutikov, Gerhard Neumann
   Abstract: Designing modern imitation learning (IL) policies requires making numerous decisions, including the selection of feature encoding, architecture, policy representation, and more. As the field rapidly advances, the range of available options continues to grow, creating a vast and largely unexplored design space for IL policies. In this work, we present X-IL, an accessible open-source framework designed to systematically explore this design space. The framework's modular design enables seamless swapping of policy components, such as backbones (e.g., Transformer, Mamba, xLSTM) and policy optimization techniques (e.g., Score-matching, Flow-matching). This flexibility facilitates comprehensive experimentation and has led to the discovery of novel policy configurations that outperform existing methods on recent robot learning benchmarks. Our experiments demonstrate not only significant performance gains but also provide valuable insights into the strengths and weaknesses of various design choices. This study serves as both a practical reference for practitioners and a foundation for guiding future research in imitation learning.
   Submitted 19 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
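
   The design-space exploration described above amounts to configuration-driven composition: encoders, sequence backbones, and policy-optimization objectives vary independently, so a sweep is a loop over configurations. A minimal sketch of that idea follows, with hypothetical registry and component names rather than X-IL's actual API (only the component choices, e.g. Transformer/Mamba/xLSTM and score/flow matching, come from the abstract):

```python
from dataclasses import dataclass

# Hypothetical registries with trivial stand-ins; a real framework would
# register actual encoder, backbone, and objective constructors here.
ENCODERS = {"mlp": lambda: "MLPEncoder"}
BACKBONES = {"transformer": lambda: "TransformerBackbone",
             "mamba": lambda: "MambaBackbone",
             "xlstm": lambda: "xLSTMBackbone"}
OBJECTIVES = {"score_matching": lambda: "ScoreMatchingHead",
              "flow_matching": lambda: "FlowMatchingHead"}

@dataclass
class PolicyConfig:
    encoder: str = "mlp"              # observation feature encoding
    backbone: str = "transformer"     # sequence model backbone
    objective: str = "flow_matching"  # policy optimization technique

def build_policy(cfg: PolicyConfig) -> tuple:
    """Compose a policy from independently swappable components."""
    return (ENCODERS[cfg.encoder](),
            BACKBONES[cfg.backbone](),
            OBJECTIVES[cfg.objective]())

# A design-space sweep is then just a loop over configurations.
for bb in BACKBONES:
    for obj in OBJECTIVES:
        print(build_policy(PolicyConfig(backbone=bb, objective=obj)))
```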

7. arXiv:2502.11358 [pdf, other] cs.AI cs.CR
   Mimicking the Familiar: Dynamic Command Generation for Information Theft Attacks in LLM Tool-Learning System
   Authors: Ziyou Jiang, Mingyang Li, Guowei Yang, Junjie Wang, Yuekai Huang, Zhiyuan Chang, Qing Wang
   Abstract: Information theft attacks pose a significant risk to Large Language Model (LLM) tool-learning systems. Adversaries can inject malicious commands through compromised tools, manipulating LLMs to send sensitive information to these tools, which leads to potential privacy breaches. However, existing attack approaches are black-box oriented and rely on static commands that cannot adapt flexibly to changes in user queries and the invocation chain of tools. This makes malicious commands more likely to be detected by the LLM, leading to attack failure. In this paper, we propose AutoCMD, a dynamic attack command generation approach for information theft attacks in LLM tool-learning systems. Inspired by the concept of mimicking the familiar, AutoCMD is capable of inferring the information utilized by upstream tools in the toolchain through learning on open-source systems and reinforcement with target system examples, thereby generating more targeted commands for information theft. The evaluation results show that AutoCMD outperforms the baselines with +13.2% $ASR_{Theft}$, and can be generalized to new tool-learning systems to expose their information leakage risks. We also design four defense methods to effectively protect tool-learning systems from the attack.
   Submitted 16 February, 2025; originally announced February 2025.
   Comments: 15 pages, 11 figures

8. arXiv:2502.11347 [pdf, other] cs.PF
   Evaluating the Performance of the DeepSeek Model in Confidential Computing Environment
   Authors: Ben Dong, Qian Wang
   Abstract: The increasing adoption of Large Language Models (LLMs) in cloud environments raises critical security concerns, particularly regarding model confidentiality and data privacy. Confidential computing, enabled by Trusted Execution Environments (TEEs), offers a promising solution to mitigate these risks. However, existing TEE implementations, primarily CPU-based, struggle to efficiently support the resource-intensive nature of LLM inference and training. In this work, we present the first evaluation of the DeepSeek model within a TEE-enabled confidential computing environment, specifically utilizing Intel Trust Domain Extensions (TDX). Our study benchmarks DeepSeek's performance across CPU-only, CPU-GPU hybrid, and TEE-based implementations. For smaller parameter sets, such as DeepSeek-R1-1.5B, the TDX implementation outperforms the CPU version in executing computations within a secure environment. This highlights the potential for efficiently deploying LLMs on resource-constrained systems while ensuring security. The overall GPU-to-CPU performance ratio averages 12 across different model sizes, with smaller models exhibiting a lower ratio. Additionally, we provide foundational insights and guidance on optimizing CPU-GPU confidential computing solutions for scalable and secure AI deployments. Our findings contribute to the advancement of privacy-preserving AI, paving the way for efficient and secure LLM inference in confidential computing environments.
   Submitted 16 February, 2025; originally announced February 2025.

9. arXiv:2502.10977 [pdf, other] cs.DS
   The Bathroom Model: A Realistic Approach to Hash Table Algorithm Optimization
   Authors: Qiantong Wang
   Abstract: Hash table search algorithms have been a fundamental research topic in computer science for decades. The widely accepted belief, originating from early theoretical work by Professor Yao, suggests that random probing is the optimal approach for open-addressing hash tables. However, a recent study by an undergraduate at the University of Cambridge challenges this notion, introducing an elastic search method with fixed interval thresholds. While this approach offers improvements over prior methods, we argue that its reliance on static threshold values limits its theoretical optimality. In this paper, we present the Bathroom Model, a novel approach to hash table search optimization inspired by real-world stall selection behavior. Unlike existing techniques, our method dynamically adjusts search strategies based on prior occupancy information, resulting in a more efficient probing mechanism. We formalize this model, analyze its theoretical performance, and compare it against state-of-the-art hash table search methods. Our results demonstrate that adaptive probing strategies significantly enhance lookup performance while maintaining low computational overhead. This research highlights the potential for fundamental algorithmic advancements in long-established domains and suggests new directions for optimizing hash table performance.
   Submitted 15 February, 2025; originally announced February 2025.
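
   The contrast drawn above between static thresholds and occupancy-informed adaptation can be made concrete with a toy open-addressing table. This is a minimal sketch under one plausible reading of "dynamically adjusts search strategies based on prior occupancy information": the table grows when recently observed probe lengths degrade, rather than at a fixed load factor. The abstract does not specify the paper's actual mechanism.

```python
class AdaptiveTable:
    """Toy open-addressing hash table that grows when recently observed
    probe lengths degrade (an occupancy signal) instead of at a fixed
    load-factor threshold. Illustrative only; not the paper's algorithm."""

    def __init__(self, capacity: int = 16):
        self.slots = [None] * capacity      # capacity stays a power of two
        self.used = 0
        self.recent = []                    # probe counts of recent inserts

    def _sequence(self, key):
        n = len(self.slots)
        i = hash(key) % n
        step = (hash((key, "step")) % n) | 1  # odd step is coprime with 2**k
        for _ in range(n):                    # so this visits every slot once
            yield i
            i = (i + step) % n

    def insert(self, key, value):
        if 2 * self.used >= len(self.slots):  # backstop: keep the toy table sparse
            self._grow()
        for probes, i in enumerate(self._sequence(key), start=1):
            if self.slots[i] is None or self.slots[i][0] == key:
                if self.slots[i] is None:
                    self.used += 1
                self.slots[i] = (key, value)
                self._record(probes)
                return

    def get(self, key):
        for i in self._sequence(key):
            if self.slots[i] is None:
                raise KeyError(key)
            if self.slots[i][0] == key:
                return self.slots[i][1]
        raise KeyError(key)

    def _record(self, probes):
        self.recent = (self.recent + [probes])[-32:]
        if sum(self.recent) / len(self.recent) > 3.0:
            self._grow()                    # adapt to observed probing cost

    def _grow(self):
        entries = [e for e in self.slots if e is not None]
        self.slots = [None] * (2 * len(self.slots))
        self.recent = []
        for k, v in entries:                # rehash into the larger table
            for i in self._sequence(k):
                if self.slots[i] is None:
                    self.slots[i] = (k, v)
                    break

table = AdaptiveTable()
for word in ["alpha", "beta", "gamma", "delta"]:
    table.insert(word, len(word))
print(table.get("gamma"))                   # 5
```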

10. arXiv:2502.10833 [pdf, other] cs.IR
    Order-agnostic Identifier for Large Language Model-based Generative Recommendation
    Authors: Xinyu Lin, Haihan Shi, Wenjie Wang, Fuli Feng, Qifan Wang, See-Kiong Ng, Tat-Seng Chua
    Abstract: Leveraging Large Language Models (LLMs) for generative recommendation has attracted significant research interest, where item tokenization is a critical step. It involves assigning item identifiers for LLMs to encode user history and generate the next item. Existing approaches leverage either token-sequence identifiers, representing items as discrete token sequences, or single-token identifiers, using ID or semantic embeddings. Token-sequence identifiers face issues such as the local optima problem in beam search and low generation efficiency due to step-by-step generation. In contrast, single-token identifiers fail to capture rich semantics or encode Collaborative Filtering (CF) information, resulting in suboptimal performance. To address these issues, we propose two fundamental principles for item identifier design: 1) integrating both CF and semantic information to fully capture multi-dimensional item information, and 2) designing order-agnostic identifiers without token dependency, mitigating the local optima issue and achieving simultaneous generation for generation efficiency. Accordingly, we introduce a novel set identifier paradigm for LLM-based generative recommendation, representing each item as a set of order-agnostic tokens. To implement this paradigm, we propose SETRec, which leverages CF and semantic tokenizers to obtain order-agnostic multi-dimensional tokens. To eliminate token dependency, SETRec uses a sparse attention mask for user history encoding and a query-guided generation mechanism for simultaneous token generation. We instantiate SETRec on T5 and Qwen (from 1.5B to 7B). Extensive experiments demonstrate its effectiveness under various scenarios (e.g., full ranking, warm- and cold-start ranking, and various item popularity groups). Moreover, results validate SETRec's superior efficiency and show promising scalability on cold-start items as model sizes increase.
    Submitted 15 February, 2025; originally announced February 2025.
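
    The difference between token-sequence and set identifiers described above is easy to see in miniature: a sequence identifier only matches when tokens arrive in one fixed order, while a set identifier matches any order. A toy illustration of the identifier distinction only, not of SETRec's tokenizers or generation mechanism (the example tokens are made up):

```python
# Token-sequence identifiers are order-sensitive; set identifiers are not.
seq_index = {("sport", "shoe", "red"): "item_42"}
set_index = {frozenset({"sport", "shoe", "red"}): "item_42"}

print(("shoe", "red", "sport") in seq_index)             # False: order matters
print(frozenset({"shoe", "red", "sport"}) in set_index)  # True: order-agnostic
```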
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10678">arXiv:2502.10678</a> <span> [<a href="https://arxiv.org/pdf/2502.10678">pdf</a>, <a href="https://arxiv.org/format/2502.10678">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714238">10.1145/3706598.3714238 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> GenComUI: Exploring Generative Visual Aids as Medium to Support Task-Oriented Human-Robot Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yate Ge</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Meiying Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xipeng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuanda Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xiaohua Sun</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+W">Weiwei Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: This work investigates the integration of generative visual aids in human-robot task communication. We developed GenComUI, a system powered by large language models that dynamically generates contextual visual aids (such as map annotations, path indicators, and animations) to support verbal task communication and facilitate the generation of customized task programs for the robot. This system was informed by a formative study that examined how humans use external visual tools to assist verbal communication in spatial tasks. To evaluate its effectiveness, we conducted a user experiment (n = 20) comparing GenComUI with a voice-only baseline. Qualitative and quantitative analyses demonstrate that generative visual aids enhance verbal task communication by providing continuous visual feedback, thus promoting natural and effective human-robot communication. Additionally, the study offers a set of design implications, emphasizing how dynamically generated visual aids can serve as an effective communication medium in human-robot interaction. These findings underscore the potential of generative visual aids to inform the design of more intuitive and effective human-robot communication, particularly for complex communication scenarios in human-robot interaction and LLM-based end-user development. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at ACM CHI '25</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2; H.5.3; I.2.7; I.2.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10667">arXiv:2502.10667</a> <span> [<a href="https://arxiv.org/pdf/2502.10667">pdf</a>, <a href="https://arxiv.org/format/2502.10667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Automated Data Quality Validation in an End-to-End GNN Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+S">Sijie Dong</a>, <a href="/search/cs?searchtype=author&query=Sahri%2C+S">Soror Sahri</a>, <a href="/search/cs?searchtype=author&query=Palpanas%2C+T">Themis Palpanas</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qitong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Ensuring data quality is crucial in modern data ecosystems, especially for training or testing datasets in machine learning. Existing validation approaches rely on computing data quality metrics and/or using expert-defined constraints. Although there are automated constraint generation methods, they are often incomplete and may be too strict or too soft, causing false positives or missed errors, thus requiring expert adjustment.
These methods may also fail to detect subtle data inconsistencies hidden by complex interdependencies within the data. In this paper, we propose DQuag, an end-to-end data quality validation and repair framework based on an improved Graph Neural Network (GNN) and multi-task learning. The proposed method incorporates a dual-decoder design: one for data quality validation and the other for data repair. Our approach captures complex feature relationships within tabular datasets using a multi-layer GNN architecture to automatically detect explicit and hidden data errors. Unlike previous methods, our model does not require manual input for constraint generation and learns the underlying feature dependencies, enabling it to identify complex hidden errors that traditional systems often miss. Moreover, it can recommend repair values, improving overall data quality. Experimental results validate the effectiveness of our approach in identifying and resolving data quality issues. The paper appeared in EDBT 2025. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09662">arXiv:2502.09662</a> <span> [<a href="https://arxiv.org/pdf/2502.09662">pdf</a>, <a href="https://arxiv.org/format/2502.09662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Cervical Cancer Screening via Large-scale Pretraining and Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yanning Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Li Ding</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Runsheng Liu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Z">Zhizhong Chai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Y">Yinling Qian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changzhong Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Anjia Han</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Cervical cancer is a leading malignancy of the female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and generalizable screening systems. To develop and validate Smart-CCS, we first curated a large-scale, multi-center dataset named CCS-127K, which comprises a total of 127,471 cervical cytology whole-slide images collected from 48 medical centers. By leveraging large-scale self-supervised pretraining, our CCS models are equipped with strong generalization capability, potentially generalizing across diverse scenarios. Then, we incorporated test-time adaptation to specifically optimize the trained CCS model for complex clinical settings, which adapts and refines predictions, improving real-world applicability. We conducted large-scale system evaluation among various cohorts. In retrospective cohorts, Smart-CCS achieved an overall area under the curve (AUC) value of 0.965 and sensitivity of 0.913 for cancer screening on 11 internal test datasets. In external testing, system performance remained high at 0.950 AUC across 6 independent test datasets. In prospective cohorts, our Smart-CCS achieved AUCs of 0.947, 0.924, and 0.986 in three prospective centers, respectively. Moreover, the system demonstrated superior sensitivity in diagnosing cervical cancer, confirming the accuracy of our cancer screening results by using histology findings for validation. Interpretability analysis with cell and slide predictions further indicated that the system's decision-making aligns with clinical practice. Smart-CCS represents a significant advancement in cancer screening across diverse clinical contexts. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09560">arXiv:2502.09560</a> <span> [<a href="https://arxiv.org/pdf/2502.09560">pdf</a>, <a href="https://arxiv.org/format/2502.09560">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EmbodiedBench: Comprehensive Benchmarking Multi-modal Large Language Models for Vision-Driven Embodied Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+R">Rui Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mark Zhao</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Cheng Qian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kangrui Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qineng Wang</a>, <a href="/search/cs?searchtype=author&query=Koripella%2C+T+V">Teja Venkat Koripella</a>, <a href="/search/cs?searchtype=author&query=Movahedi%2C+M">Marziyeh Movahedi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Manling Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+H">Heng Ji</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Leveraging Multi-modal Large Language Models (MLLMs) to create embodied agents offers a promising avenue for tackling real-world tasks. While language-centric embodied agents have garnered substantial attention, MLLM-based embodied agents remain underexplored due to the lack of comprehensive evaluation frameworks. To bridge this gap, we introduce EmbodiedBench, an extensive benchmark designed to evaluate vision-driven embodied agents.
EmbodiedBench features: (1) a diverse set of 1,128 testing tasks across four environments, ranging from high-level semantic tasks (e.g., household) to low-level tasks involving atomic actions (e.g., navigation and manipulation); and (2) six meticulously curated subsets evaluating essential agent capabilities like commonsense reasoning, complex instruction understanding, spatial awareness, visual perception, and long-term planning. Through extensive experiments, we evaluated 13 leading proprietary and open-source MLLMs within EmbodiedBench. Our findings reveal that MLLMs excel at high-level tasks but struggle with low-level manipulation, with the best model, GPT-4o, scoring only 28.9% on average. EmbodiedBench provides a multifaceted standardized evaluation platform that not only highlights existing challenges but also offers valuable insights to advance MLLM-based embodied agents. Our code is available at https://embodiedbench.github.io. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">51 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09080">arXiv:2502.09080</a> <span> [<a href="https://arxiv.org/pdf/2502.09080">pdf</a>, <a href="https://arxiv.org/format/2502.09080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> BevSplat: Resolving Height Ambiguity via Feature-Based Gaussian Primitives for Weakly-Supervised Cross-View Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shaoxun Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yujiao Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
This paper addresses the problem of weakly supervised cross-view localization, where the goal is to estimate the pose of a ground camera relative to a satellite image with noisy ground truth annotations. A common approach to bridge the cross-view domain gap for pose estimation is Bird's-Eye View (BEV) synthesis. However, existing methods struggle with height ambiguity due to the lack of depth information in ground images and satellite height maps. Previous solutions either assume a flat ground plane or rely on complex models, such as cross-view transformers. We propose BevSplat, a novel method that resolves height ambiguity by using feature-based Gaussian primitives. Each pixel in the ground image is represented by a 3D Gaussian with semantic and spatial features, which are synthesized into a BEV feature map for relative pose estimation. Additionally, to address challenges with panoramic query images, we introduce an icosphere-based supervision strategy for the Gaussian primitives. We validate our method on the widely used KITTI and VIGOR datasets, which include both pinhole and panoramic query images. Experimental results show that BevSplat significantly improves localization accuracy over prior approaches. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09029">arXiv:2502.09029</a> <span> [<a href="https://arxiv.org/pdf/2502.09029">pdf</a>, <a href="https://arxiv.org/format/2502.09029">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> MTDP: Modulated Transformer Diffusion Policy Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianhao Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yinqian Sun</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+E">Enmeng Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yi Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
Recent research on robot manipulation based on Behavior Cloning (BC) has made significant progress. By combining diffusion models with BC, diffusion policy has been proposed, enabling robots to quickly learn manipulation tasks with high success rates. However, integrating diffusion policy with a high-capacity Transformer presents challenges: traditional Transformer architectures struggle to effectively integrate guiding conditions, resulting in poor performance in manipulation tasks when using Transformer-based models. In this paper, we investigate key architectural designs of Transformers and improve the traditional Transformer architecture by proposing the Modulated Transformer Diffusion Policy (MTDP) model for diffusion policy. The core of this model is the Modulated Attention module we propose, which more effectively integrates the guiding conditions with the main input, improving the generative model's output quality and, consequently, increasing the robot's task success rate. In six experimental tasks, MTDP outperformed existing Transformer model architectures, particularly in the Toolhang experiment, where the success rate increased by 12%. To verify the generality of Modulated Attention, we applied it to the UNet architecture to construct the Modulated UNet Diffusion Policy (MUDP) model, which also achieved higher success rates than existing UNet architectures across all six experiments. The Diffusion Policy uses Denoising Diffusion Probabilistic Models (DDPM) as the diffusion model. Building on this, we also explored Denoising Diffusion Implicit Models (DDIM) as the diffusion model, constructing the MTDP-I and MUDP-I models, which nearly doubled the generation speed while maintaining performance. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
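<p>The abstract does not define the Modulated Attention module precisely. As a rough picture, conditioning mechanisms in this family often predict per-channel scale and shift terms from the guiding signal (in the FiLM/AdaLN sense) rather than concatenating it to the input; the sketch below is a minimal illustration under that assumption, not the paper's exact design.</p>
<pre><code># Sketch of condition-modulated self-attention in the FiLM/AdaLN style.
# Illustrative only: the guiding condition (e.g., a diffusion timestep
# embedding) modulates the normalized input instead of being concatenated.
import torch
import torch.nn as nn

class ModulatedSelfAttention(nn.Module):
    def __init__(self, d_model=256, n_heads=4, d_cond=128):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, elementwise_affine=False)
        self.to_scale_shift = nn.Linear(d_cond, 2 * d_model)
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

    def forward(self, x, cond):                 # x: (B, T, d), cond: (B, d_cond)
        scale, shift = self.to_scale_shift(cond).chunk(2, dim=-1)
        h = self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
        out, _ = self.attn(h, h, h)
        return x + out                          # residual connection

y = ModulatedSelfAttention()(torch.randn(2, 16, 256), torch.randn(2, 128))
</code></pre>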
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08880">arXiv:2502.08880</a> <span> [<a href="https://arxiv.org/pdf/2502.08880">pdf</a>, <a href="https://arxiv.org/format/2502.08880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Quantum Trojan Insertion: Controlled Activation for Covert Circuit Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=John%2C+J">Jayden John</a>, <a href="/search/cs?searchtype=author&query=Golla%2C+L">Lakshman Golla</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Quantum computing has demonstrated superior efficiency compared to classical computing. Quantum circuits are essential for implementing functions and achieving correct computational outcomes. Quantum circuit compilers, which translate high-level quantum operations into hardware-specific gates while optimizing performance, serve as the interface between the quantum software stack and physical quantum machines. However, untrusted compilers can introduce malicious hardware Trojans into quantum circuits, altering their functionality and leading to incorrect results. In the world of classical computing, effective hardware Trojans are a critical threat to integrated circuits; such attacks often involve stealthily inserting conditional logic gates that activate under specific input conditions. In this paper, we propose a novel advanced quantum Trojan that is controllable, allowing it to be activated or deactivated under different circumstances. These Trojans remain dormant until triggered by predefined input conditions, making detection challenging. Through a series of benchmark experiments, we demonstrate the feasibility of this method by evaluating the effectiveness of embedding controlled Trojans in quantum circuits and measuring their impact on circuit performance and security.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08880v1-abstract-full').style.display = 'none'; document.getElementById('2502.08880v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08639">arXiv:2502.08639</a> <span> [<a href="https://arxiv.org/pdf/2502.08639">pdf</a>, <a href="https://arxiv.org/format/2502.08639">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CineMaster: A 3D-Aware and Controllable Framework for Cinematic Text-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qinghe Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yawen Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xiaoyu Shi</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+T">Tianfan Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xintao Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+P">Pengfei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Di Zhang</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+K">Kun Gai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08639v1-abstract-short" style="display: inline;"> In this work, we present CineMaster, a novel framework for 3D-aware and controllable text-to-video generation. Our goal is to empower users with comparable controllability as professional film directors: precise placement of objects within the scene, flexible manipulation of both objects and camera in 3D space, and intuitive layout control over the rendered frames. To achieve this, CineMaster oper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08639v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08639v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08639v1-abstract-full" style="display: none;"> In this work, we present CineMaster, a novel framework for 3D-aware and controllable text-to-video generation. Our goal is to empower users with comparable controllability as professional film directors: precise placement of objects within the scene, flexible manipulation of both objects and camera in 3D space, and intuitive layout control over the rendered frames. To achieve this, CineMaster operates in two stages. In the first stage, we design an interactive workflow that allows users to intuitively construct 3D-aware conditional signals by positioning object bounding boxes and defining camera movements within the 3D space. 
In the second stage, these control signals--comprising rendered depth maps, camera trajectories and object class labels--serve as the guidance for a text-to-video diffusion model, guiding the model to generate the user-intended video content. Furthermore, to overcome the scarcity of in-the-wild datasets with 3D object motion and camera pose annotations, we carefully establish an automated data annotation pipeline that extracts 3D bounding boxes and camera trajectories from large-scale video data. Extensive qualitative and quantitative experiments demonstrate that CineMaster significantly outperforms existing methods and achieves prominent 3D-aware text-to-video generation. Project page: https://cinemaster-dev.github.io/. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08262">arXiv:2502.08262</a> <span> [<a href="https://arxiv.org/pdf/2502.08262">pdf</a>, <a href="https://arxiv.org/format/2502.08262">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GenIAS: Generator for Instantiating Anomalies in time Series </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Darban%2C+Z+Z">Zahra Zamanzadeh Darban</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Webb%2C+G+I">Geoffrey I. Webb</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shirui Pan</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+C+C">Charu C. Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Salehi%2C+M">Mahsa Salehi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: A recent and promising approach for building time series anomaly detection (TSAD) models is to inject synthetic samples of anomalies within real data sets.
The existing injection mechanisms have significant limitations - most of them rely on ad hoc, hand-crafted strategies which fail to capture the natural diversity of anomalous patterns, or are restricted to univariate time series settings. To address these challenges, we design a generative model for TSAD using a variational autoencoder, which is referred to as a Generator for Instantiating Anomalies in Time Series (GenIAS). GenIAS is designed to produce diverse and realistic synthetic anomalies for TSAD tasks. By employing a novel learned perturbation mechanism in the latent space and injecting the perturbed patterns in different segments of time series, GenIAS can generate anomalies with greater diversity and varying scales. Further, guided by a new triplet loss function, which uses a min-max margin and a new variance-scaling approach to further enforce the learning of compact normal patterns, GenIAS ensures that anomalies are distinct from normal samples while remaining realistic. The approach is effective for both univariate and multivariate time series. We demonstrate the diversity and realism of the generated anomalies. Our extensive experiments demonstrate that GenIAS - when integrated into a TSAD task - consistently outperforms seventeen traditional and deep anomaly detection models, thereby highlighting the potential of generative models for time series anomaly generation. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08132">arXiv:2502.08132</a> <span> [<a href="https://arxiv.org/pdf/2502.08132">pdf</a>, <a href="https://arxiv.org/format/2502.08132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SS4Rec: Continuous-Time Sequential Recommendation with State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Wei Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huiying Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qifeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
Sequential recommendation is a key area in the field of recommendation systems aiming to model user interest based on historical interaction sequences with irregular intervals. While previous recurrent neural network-based and attention-based approaches have achieved significant results, they have limitations in capturing system continuity due to their discrete characteristics. In the context of continuous-time modeling, the state space model (SSM) offers a potential solution, as it can effectively capture the dynamic evolution of user interest over time. However, existing SSM-based approaches ignore the impact of irregular time intervals within historical user interactions, making it difficult to model complex user-item transitions in sequences. To address this issue, we propose a hybrid SSM-based model called SS4Rec for continuous-time sequential recommendation. SS4Rec integrates a time-aware SSM to handle irregular time intervals and a relation-aware SSM to model contextual dependencies, enabling it to infer user interest from both temporal and sequential perspectives. In the training process, the time-aware SSM and the relation-aware SSM are discretized by variable stepsizes according to user interaction time intervals and input data, respectively. This helps capture the continuous dependency from irregular time intervals and provides time-specific personalized recommendations. Experimental studies on five benchmark datasets demonstrate the superiority and effectiveness of SS4Rec. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
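<p class="mathjax">For orientation, variable-stepsize discretization of a linear SSM $h'(t) = A h(t) + B x(t)$ is usually performed by zero-order hold, with the stepsize $\Delta_k$ tied to the observed gap between consecutive interactions. This is the generic recipe such models build on, not SS4Rec's exact parameterization: $$\bar{A}_k = \exp(\Delta_k A), \qquad \bar{B}_k = (\Delta_k A)^{-1}\left(\exp(\Delta_k A) - I\right)\Delta_k B, \qquad h_{k+1} = \bar{A}_k h_k + \bar{B}_k x_k.$$</p>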
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07602">arXiv:2502.07602</a> <span> [<a href="https://arxiv.org/pdf/2502.07602">pdf</a>, <a href="https://arxiv.org/format/2502.07602">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> An Improved Optimal Proximal Gradient Algorithm for Non-Blind Image Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qingsong Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shengze Xu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+X">Xiaojiao Tong</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+T">Tieyong Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Image deblurring remains a central research area within image processing, critical for its role in enhancing image quality and facilitating clearer visual representations across diverse applications. This paper tackles the optimization problem of image deblurring, assuming a known blurring kernel. We introduce an improved optimal proximal gradient algorithm (IOptISTA), which builds upon the optimal gradient method and a weighting matrix, to efficiently address the non-blind image deblurring problem. Based on two regularization cases, namely the $l_1$ norm and total variation norm, we perform numerical experiments to assess the performance of our proposed algorithm. The results indicate that our algorithm yields enhanced PSNR and SSIM values, as well as a reduced tolerance, compared to existing methods. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
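<p>For reference, the template that proximal gradient deblurring methods refine is ISTA: a gradient step on the data-fidelity term followed by the proximal operator of the regularizer, which for the $l_1$ norm is soft-thresholding. The sketch below is this textbook baseline only; IOptISTA's optimal-gradient scheme and weighting matrix are not reproduced here.</p>
<pre><code># Baseline ISTA for l1-regularized non-blind deblurring (textbook version,
# not the paper's IOptISTA). `blur` applies the known kernel A and
# `blur_adj` applies its adjoint A^T.
import numpy as np

def soft_threshold(x, tau):
    return np.sign(x) * np.maximum(np.abs(x) - tau, 0.0)

def ista(blur, blur_adj, y, lam, step, n_iter=200):
    """Minimize 0.5 * ||A x - y||^2 + lam * ||x||_1."""
    x = np.zeros_like(blur_adj(y))
    for _ in range(n_iter):
        grad = blur_adj(blur(x) - y)          # gradient of the data term
        x = soft_threshold(x - step * grad, step * lam)
    return x

# Toy usage with the identity as a stand-in kernel.
x_hat = ista(lambda v: v, lambda v: v, np.random.randn(8, 8), lam=0.1, step=1.0)
</code></pre>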
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07557">arXiv:2502.07557</a> <span> [<a href="https://arxiv.org/pdf/2502.07557">pdf</a>, <a href="https://arxiv.org/format/2502.07557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> JBShield: Defending Large Language Models from Jailbreak Attacks through Activated Concept Analysis and Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shenyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yuchen Zhai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+K">Keyan Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hongxin Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shengnan Guo</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lingchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Despite the implementation of safety alignment strategies, large language models (LLMs) remain vulnerable to jailbreak attacks, which undermine these safety guardrails and pose significant security threats. Some defenses have been proposed to detect or mitigate jailbreaks, but they are unable to withstand the test of time due to an insufficient understanding of jailbreak mechanisms. In this work, we investigate the mechanisms behind jailbreaks based on the Linear Representation Hypothesis (LRH), which states that neural networks encode high-level concepts as subspaces in their hidden representations. We define the toxic semantics in harmful and jailbreak prompts as toxic concepts and describe the semantics in jailbreak prompts that manipulate LLMs to comply with unsafe requests as jailbreak concepts. Through concept extraction and analysis, we reveal that LLMs can recognize the toxic concepts in both harmful and jailbreak prompts. However, unlike harmful prompts, jailbreak prompts activate the jailbreak concepts and alter the LLM output from rejection to compliance.
Building on our analysis, we propose a comprehensive jailbreak defense framework, JBShield, consisting of two key components: jailbreak detection JBShield-D and mitigation JBShield-M. JBShield-D identifies jailbreak prompts by determining whether the input activates both toxic and jailbreak concepts. When a jailbreak prompt is detected, JBShield-M adjusts the hidden representations of the target LLM by enhancing the toxic concept and weakening the jailbreak concept, ensuring LLMs produce safe content. Extensive experiments demonstrate the superior performance of JBShield, achieving an average detection accuracy of 0.95 and reducing the average attack success rate of various jailbreak attacks from 61% to 2% across distinct LLMs. </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the 34th USENIX Security Symposium, August 13-15, 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07527">arXiv:2502.07527</a> <span> [<a href="https://arxiv.org/pdf/2502.07527">pdf</a>, <a href="https://arxiv.org/format/2502.07527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> NatureLM: Deciphering the Language of Nature for Scientific Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yingce Xia</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Peiran Jin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shufang Xie</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Liang He</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chuan Cao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+R">Renqian Luo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guoqing Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yue Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zequn Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuan-Jyue Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zekun Guo</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yeqi Bai</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+P">Pan Deng</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yaosen Min</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Ziheng Lu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hongxia Hao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Han Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jielan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jia Zhang</a>,
href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kehan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+K">Kaiyuan Gao</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+Q">Qizhi Pei</a> , et al. (20 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07527v1-abstract-short" style="display: inline;"> Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typical… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07527v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07527v1-abstract-full" style="display: none;"> Foundation models have revolutionized natural language processing and artificial intelligence, significantly enhancing how machines comprehend and generate human languages. Inspired by the success of these foundation models, researchers have developed foundation models for individual scientific domains, including small molecules, materials, proteins, DNA, and RNA. However, these models are typically trained in isolation, lacking the ability to integrate across different scientific domains. Recognizing that entities within these domains can all be represented as sequences, which together form the "language of nature", we introduce Nature Language Model (briefly, NatureLM), a sequence-based science foundation model designed for scientific discovery. Pre-trained with data from multiple scientific domains, NatureLM offers a unified, versatile model that enables various applications including: (i) generating and optimizing small molecules, proteins, RNA, and materials using text instructions; (ii) cross-domain generation/design, such as protein-to-molecule and protein-to-RNA generation; and (iii) achieving state-of-the-art performance in tasks like SMILES-to-IUPAC translation and retrosynthesis on USPTO-50k. NatureLM offers a promising generalist approach for various scientific tasks, including drug discovery (hit generation/optimization, ADMET optimization, synthesis), novel material design, and the development of therapeutic proteins or nucleotides. We have developed NatureLM models in different sizes (1 billion, 8 billion, and 46.7 billion parameters) and observed a clear improvement in performance as the model size increases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07527v1-abstract-full').style.display = 'none'; document.getElementById('2502.07527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">81 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06418">arXiv:2502.06418</a> <span> [<a href="https://arxiv.org/pdf/2502.06418">pdf</a>, <a href="https://arxiv.org/format/2502.06418">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Robust Watermarks Leak: Channel-Aware Feature Extraction Enables Adversarial Watermark Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ba%2C+Z">Zhongjie Ba</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yitao Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+P">Peng Cheng</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+B">Bin Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qinglong Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+K">Kui Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06418v1-abstract-short" style="display: inline;"> Watermarking plays a key role in the provenance and detection of AI-generated content. While existing methods prioritize robustness against real-world distortions (e.g., JPEG compression and noise addition), we reveal a fundamental tradeoff: such robust watermarks inherently improve the redundancy of detectable patterns encoded into images, creating exploitable information leakage. To leverage thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06418v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06418v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06418v1-abstract-full" style="display: none;"> Watermarking plays a key role in the provenance and detection of AI-generated content. While existing methods prioritize robustness against real-world distortions (e.g., JPEG compression and noise addition), we reveal a fundamental tradeoff: such robust watermarks inherently improve the redundancy of detectable patterns encoded into images, creating exploitable information leakage. To leverage this, we propose an attack framework that extracts leakage of watermark patterns through multi-channel feature learning using a pre-trained vision model. Unlike prior works requiring massive data or detector access, our method achieves both forgery and detection evasion with a single watermarked image. Extensive experiments demonstrate that our method achieves a 60\% success rate gain in detection evasion and 51\% improvement in forgery accuracy compared to state-of-the-art methods while maintaining visual fidelity. Our work exposes the robustness-stealthiness paradox: current "robust" watermarks sacrifice security for distortion resistance, providing insights for future watermark design. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06418v1-abstract-full').style.display = 'none'; document.getElementById('2502.06418v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06327">arXiv:2502.06327</a> <span> [<a href="https://arxiv.org/pdf/2502.06327">pdf</a>, <a href="https://arxiv.org/format/2502.06327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Driven Continual Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianfei Zhou</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+R">Rui Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06327v1-abstract-short" style="display: inline;"> Continual Graph Learning (CGL), which aims to accommodate new tasks over evolving graph data without forgetting prior knowledge, is garnering significant research interest. Mainstream solutions adopt the memory replay-based idea, ie, caching representative data from earlier tasks for retraining the graph model. However, this strategy struggles with scalability issues for constantly evolving graphs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06327v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06327v1-abstract-full" style="display: none;"> Continual Graph Learning (CGL), which aims to accommodate new tasks over evolving graph data without forgetting prior knowledge, is garnering significant research interest. Mainstream solutions adopt the memory replay-based idea, ie, caching representative data from earlier tasks for retraining the graph model. However, this strategy struggles with scalability issues for constantly evolving graphs and raises concerns regarding data privacy. Inspired by recent advancements in the prompt-based learning paradigm, this paper introduces a novel prompt-driven continual graph learning (PROMPTCGL) framework, which learns a separate prompt for each incoming task and maintains the underlying graph neural network model fixed. In this way, PROMPTCGL naturally avoids catastrophic forgetting of knowledge from previous tasks. More specifically, we propose hierarchical prompting to instruct the model from both feature- and topology-level to fully address the variability of task graphs in dynamic continual learning. 
Additionally, we develop a personalized prompt generator to generate tailored prompts for each graph node while minimizing the number of prompts needed, leading to constant memory consumption regardless of the graph scale. Extensive experiments on four benchmarks show that PROMPTCGL achieves superior performance against existing CGL approaches while significantly reducing memory consumption. Our code is available at https://github.com/QiWang98/PromptCGL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06327v1-abstract-full').style.display = 'none'; document.getElementById('2502.06327v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05859">arXiv:2502.05859</a> <span> [<a href="https://arxiv.org/pdf/2502.05859">pdf</a>, <a href="https://arxiv.org/format/2502.05859">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SphereFusion: Efficient Panorama Depth Estimation via Gated Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Qingsong Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kaiyong Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xiaowen Chu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+F">Fei Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05859v1-abstract-short" style="display: inline;"> Due to the rapid development of panorama cameras, the task of estimating panorama depth has attracted significant attention from the computer vision community, especially in applications such as robot sensing and autonomous driving. However, existing methods relying on different projection formats often encounter challenges, either struggling with distortion and discontinuity in the case of equire… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05859v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05859v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05859v1-abstract-full" style="display: none;"> Due to the rapid development of panorama cameras, the task of estimating panorama depth has attracted significant attention from the computer vision community, especially in applications such as robot sensing and autonomous driving.
However, existing methods relying on different projection formats often encounter challenges, either struggling with distortion and discontinuity in the case of equirectangular, cubemap, and tangent projections, or experiencing a loss of texture details with the spherical projection. To tackle these concerns, we present SphereFusion, an end-to-end framework that combines the strengths of various projection methods. Specifically, SphereFusion initially employs 2D image convolution and mesh operations to extract two distinct types of features from the panorama image in both equirectangular and spherical projection domains. These features are then projected onto the spherical domain, where a gated fusion module selects the most reliable features for fusion. Finally, SphereFusion estimates panorama depth within the spherical domain. Meanwhile, SphereFusion employs a cache strategy to improve the efficiency of mesh operations. Extensive experiments on three public panorama datasets demonstrate that SphereFusion achieves competitive results with other state-of-the-art methods, while achieving the fastest inference speed, only 17 ms on a 512$\times$1024 panorama image. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05859v1-abstract-full').style.display = 'none'; document.getElementById('2502.05859v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05588">arXiv:2502.05588</a> <span> [<a href="https://arxiv.org/pdf/2502.05588">pdf</a>, <a href="https://arxiv.org/ps/2502.05588">ps</a>, <a href="https://arxiv.org/format/2502.05588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Information Freshness of IEEE 802.11ax Uplink OFDMA-Based Random Access </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingwei Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=He"> He</a>, <a href="/search/cs?searchtype=author&query=Chen"> Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05588v1-abstract-short" style="display: inline;"> The latest WiFi standard, IEEE 802.11ax (WiFi 6), introduces a novel uplink random access mechanism called uplink orthogonal frequency division multiple access-based random access (UORA).
While existing work has evaluated the performance of UORA using conventional performance metrics, such as throughput and delay, its information freshness performance has not been thoroughly investigated in the li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05588v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05588v1-abstract-full" style="display: none;"> The latest WiFi standard, IEEE 802.11ax (WiFi 6), introduces a novel uplink random access mechanism called uplink orthogonal frequency division multiple access-based random access (UORA). While existing work has evaluated the performance of UORA using conventional performance metrics, such as throughput and delay, its information freshness performance has not been thoroughly investigated in the literature. This is of practical significance as WiFi 6 and beyond are expected to support real-time applications. This paper presents the first attempt to fill this gap by investigating the information freshness, quantified by the Age of Information (AoI) metric, in UORA networks. We establish an analytical framework comprising two discrete-time Markov chains (DTMCs) to characterize the transmission states of stations (STAs) in UORA networks. Building on the formulated DTMCs, we derive an analytical expression for the long-term average AoI (AAoI), facilitating the optimization of UORA parameters for enhanced AoI performance through exhaustive search. To gain deeper design insights and improve the effectiveness of UORA parameter optimization, we derive a closed-form expression for the AAoI and its approximated lower bound for a simplified scenario characterized by a fixed backoff contention window and generate-at-will status updates. By analyzing the approximated lower bound of the AAoI, we propose efficient UORA parameter optimization algorithms that can be realized with only a few comparisons of different possible values of the parameters to be optimized. Simulation results validate our analysis and demonstrate that the AAoI achieved through our proposed parameter optimization algorithm closely approximates the optimal AoI performance obtained via exhaustive search, outperforming the round-robin and max-AoI policies in large and low-traffic networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05588v1-abstract-full').style.display = 'none'; document.getElementById('2502.05588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
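<p class="is-size-7 has-text-grey-dark">For readers new to the AoI metric used above: the long-term average AoI (AAoI) of a slotted random access system can be estimated by simulation. A toy sketch, assuming a fixed per-slot transmission probability and generate-at-will sources rather than the paper's UORA/DTMC model (all parameters are illustrative):</p>
<pre><code class="language-python">
import numpy as np

def average_aoi(num_stations=10, p_tx=0.1, slots=100_000, seed=0):
    """Simulate the long-term average AoI (AAoI) in a toy slotted random
    access channel: a slot delivers an update for a station iff it is the
    only transmitter; generate-at-will sources reset their age on success."""
    rng = np.random.default_rng(seed)
    age = np.ones(num_stations)
    total = 0.0
    for _ in range(slots):
        tx = rng.binomial(1, p_tx, num_stations)
        if tx.sum() == 1:              # collision-free slot: one update lands
            age[np.argmax(tx)] = 0.0
        age += 1.0                     # every station ages by one slot
        total += age.mean()
    return total / slots

# crude parameter sweep, standing in for the paper's optimization step
for p in (0.02, 0.05, 0.1, 0.2):
    print(f"p_tx={p}: AAoI ~ {average_aoi(p_tx=p):.1f} slots")
</code></pre>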
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04424">arXiv:2502.04424</a> <span> [<a href="https://arxiv.org/pdf/2502.04424">pdf</a>, <a href="https://arxiv.org/format/2502.04424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EmoBench-M: Benchmarking Emotional Intelligence for Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+H">He Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yucheng Zhou</a>, <a href="/search/cs?searchtype=author&query=You%2C+L">Lianzhong You</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongbo Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianning Wang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+Z">Zheng Lian</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F+R">Fei Richard Yu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Laizhong Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04424v1-abstract-short" style="display: inline;"> With the integration of Multimodal large language models (MLLMs) into robotic systems and various AI applications, embedding emotional intelligence (EI) capabilities into these models is essential for enabling robots to effectively address human emotional needs and interact seamlessly in real-world scenarios. Existing static, text-based, or text-image benchmarks overlook the multimodal complexitie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04424v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04424v1-abstract-full" style="display: none;"> With the integration of Multimodal large language models (MLLMs) into robotic systems and various AI applications, embedding emotional intelligence (EI) capabilities into these models is essential for enabling robots to effectively address human emotional needs and interact seamlessly in real-world scenarios. Existing static, text-based, or text-image benchmarks overlook the multimodal complexities of real-world interactions and fail to capture the dynamic, multimodal nature of emotional expressions, making them inadequate for evaluating MLLMs' EI. Based on established psychological theories of EI, we build EmoBench-M, a novel benchmark designed to evaluate the EI capability of MLLMs across 13 valuation scenarios from three key dimensions: foundational emotion recognition, conversational emotion understanding, and socially complex emotion analysis. Evaluations of both open-source and closed-source MLLMs on EmoBench-M reveal a significant performance gap between them and humans, highlighting the need to further advance their EI capabilities. All benchmark resources, including code and datasets, are publicly available at https://emo-gml.github.io/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04424v1-abstract-full').style.display = 'none'; document.getElementById('2502.04424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03821">arXiv:2502.03821</a> <span> [<a href="https://arxiv.org/pdf/2502.03821">pdf</a>, <a href="https://arxiv.org/format/2502.03821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PsyPlay: Personality-Infused Role-Playing Conversational Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuhua Zhu</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+X">Xiaojun Quan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cong Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03821v1-abstract-short" style="display: inline;"> The current research on Role-Playing Conversational Agents (RPCAs) with Large Language Models (LLMs) primarily focuses on imitating specific speaking styles and utilizing character backgrounds, neglecting the depiction of deeper personality traits.~In this study, we introduce personality-infused role-playing for LLM agents, which encourages agents to accurately portray their designated personality… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03821v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03821v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03821v1-abstract-full" style="display: none;"> The current research on Role-Playing Conversational Agents (RPCAs) with Large Language Models (LLMs) primarily focuses on imitating specific speaking styles and utilizing character backgrounds, neglecting the depiction of deeper personality traits.~In this study, we introduce personality-infused role-playing for LLM agents, which encourages agents to accurately portray their designated personality traits during dialogues. We then propose PsyPlay, a dialogue generation framework that facilitates the expression of rich personalities among multiple LLM agents. Specifically, PsyPlay enables agents to assume roles with distinct personality traits and engage in discussions centered around specific topics, consistently exhibiting their designated personality traits throughout the interactions. Validation on generated dialogue data demonstrates that PsyPlay can accurately portray the intended personality traits, achieving an overall success rate of 80.31% on GPT-3.5. Notably, we observe that LLMs aligned with positive values are more successful in portraying positive personality roles compared to negative ones. 
Moreover, we construct a dialogue corpus for personality-infused role-playing, called PsyPlay-Bench. The corpus, which consists of 4745 instances of correctly portrayed dialogues using PsyPlay, aims to further facilitate research in personalized role-playing and dialogue personality detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03821v1-abstract-full').style.display = 'none'; document.getElementById('2502.03821v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03498">arXiv:2502.03498</a> <span> [<a href="https://arxiv.org/pdf/2502.03498">pdf</a>, <a href="https://arxiv.org/format/2502.03498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Controllable Satellite-to-Street-View Synthesis with Precise Pose Alignment and Zero-Shot Environmental Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ze%2C+X">Xianghui Ze</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhenbo Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jianfeng Lu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yujiao Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03498v1-abstract-short" style="display: inline;"> Generating street-view images from satellite imagery is a challenging task, particularly in maintaining accurate pose alignment and incorporating diverse environmental conditions. While diffusion models have shown promise in generative tasks, their ability to maintain strict pose alignment throughout the diffusion process is limited. In this paper, we propose a novel Iterative Homography Adjustmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03498v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03498v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03498v1-abstract-full" style="display: none;"> Generating street-view images from satellite imagery is a challenging task, particularly in maintaining accurate pose alignment and incorporating diverse environmental conditions. While diffusion models have shown promise in generative tasks, their ability to maintain strict pose alignment throughout the diffusion process is limited. In this paper, we propose a novel Iterative Homography Adjustment (IHA) scheme applied during the denoising process, which effectively addresses pose misalignment and ensures spatial consistency in the generated street-view images. 
Additionally, currently available datasets for satellite-to-street-view generation are limited in their diversity of illumination and weather conditions, thereby restricting the generalizability of the generated outputs. To mitigate this, we introduce a text-guided illumination and weather-controlled sampling strategy that enables fine-grained control over the environmental factors. Extensive quantitative and qualitative evaluations demonstrate that our approach significantly improves pose accuracy and enhances the diversity and realism of generated street-view images, setting a new benchmark for satellite-to-street-view generation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03498v1-abstract-full').style.display = 'none'; document.getElementById('2502.03498v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03317">arXiv:2502.03317</a> <span> [<a href="https://arxiv.org/pdf/2502.03317">pdf</a>, <a href="https://arxiv.org/format/2502.03317">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Contact-Aware Motion Planning Among Movable Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haokun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianhao Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+F">Fei Gao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Shaojie Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03317v1-abstract-short" style="display: inline;"> Most existing methods for motion planning of mobile robots involve generating collision-free trajectories. However, these methods focusing solely on contact avoidance may limit the robots' locomotion and cannot be applied to tasks where contact is inevitable or intentional. To address these issues, we propose a novel contact-aware motion planning (CAMP) paradigm for robotic systems. Our approach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03317v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03317v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03317v1-abstract-full" style="display: none;"> Most existing methods for motion planning of mobile robots involve generating collision-free trajectories. However, these methods focusing solely on contact avoidance may limit the robots' locomotion and cannot be applied to tasks where contact is inevitable or intentional. To address these issues, we propose a novel contact-aware motion planning (CAMP) paradigm for robotic systems. Our approach incorporates contact between robots and movable objects as complementarity constraints in optimization-based trajectory planning.
By leveraging augmented Lagrangian methods (ALMs), we efficiently solve the optimization problem with complementarity constraints, producing spatio-temporally optimal trajectories of the robots. Simulations demonstrate that, compared to the state-of-the-art method, our proposed CAMP method expands the reachable space of mobile robots, resulting in a significant improvement in the success rate of two types of fundamental tasks: navigation among movable objects (NAMO) and rearrangement of movable objects (RAMO). Real-world experiments show that the trajectories generated by our proposed method are feasible and can be quickly deployed in different tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03317v1-abstract-full').style.display = 'none'; document.getElementById('2502.03317v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02723">arXiv:2502.02723</a> <span> [<a href="https://arxiv.org/pdf/2502.02723">pdf</a>, <a href="https://arxiv.org/format/2502.02723">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dobi-SVD: Differentiable SVD for LLM Compression and Some New Perspectives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qinsi Wang</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+J">Jinghan Ke</a>, <a href="/search/cs?searchtype=author&query=Tomizuka%2C+M">Masayoshi Tomizuka</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiran Chen</a>, <a href="/search/cs?searchtype=author&query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chenfeng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02723v1-abstract-short" style="display: inline;"> We provide a new LLM-compression solution via SVD, unlocking new possibilities for LLM compression beyond quantization and pruning. We point out that the optimal use of SVD lies in truncating activations, rather than merely using activations as an optimization distance. Building on this principle, we address three critical challenges in SVD-based LLM compression: (1) How can we determine… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02723v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02723v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02723v1-abstract-full" style="display: none;"> We provide a new LLM-compression solution via SVD, unlocking new possibilities for LLM compression beyond quantization and pruning.
We point out that the optimal use of SVD lies in truncating activations, rather than merely using activations as an optimization distance. Building on this principle, we address three critical challenges in SVD-based LLM compression: (1) How can we determine the optimal activation truncation position for each weight matrix in LLMs? (2) How can we efficiently reconstruct the weight matrices based on truncated activations? (3) How can we address the inherent "injection" nature that results in the information loss of the SVD? We propose Dobi-SVD, which establishes a new, principled approach to SVD-based LLM compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02723v1-abstract-full').style.display = 'none'; document.getElementById('2502.02723v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02588">arXiv:2502.02588</a> <span> [<a href="https://arxiv.org/pdf/2502.02588">pdf</a>, <a href="https://arxiv.org/format/2502.02588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Calibrated Multi-Preference Optimization for Aligning Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyungmin Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifei Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junfeng He</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+J">Junjie Ke</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&query=Essa%2C+I">Irfan Essa</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Feng Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinxiao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02588v1-abstract-short" style="display: inline;"> Aligning text-to-image (T2I) diffusion models with preference optimization is valuable for human-annotated datasets, but the heavy cost of manual data collection limits scalability. Using reward models offers an alternative; however, current preference optimization methods fall short in exploiting the rich information, as they only consider pairwise preference distribution.
Furthermore, they lack… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02588v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02588v1-abstract-full" style="display: none;"> Aligning text-to-image (T2I) diffusion models with preference optimization is valuable for human-annotated datasets, but the heavy cost of manual data collection limits scalability. Using reward models offers an alternative; however, current preference optimization methods fall short in exploiting the rich information, as they only consider pairwise preference distribution. Furthermore, they lack generalization to multi-preference scenarios and struggle to handle inconsistencies between rewards. To address this, we present Calibrated Preference Optimization (CaPO), a novel method to align T2I diffusion models by incorporating the general preference from multiple reward models without human annotated data. The core of our approach involves a reward calibration method to approximate the general preference by computing the expected win-rate against the samples generated by the pretrained models. Additionally, we propose a frontier-based pair selection method that effectively manages the multi-preference distribution by selecting pairs from Pareto frontiers. Finally, we use regression loss to fine-tune diffusion models to match the difference between calibrated rewards of a selected pair. Experimental results show that CaPO consistently outperforms prior methods, such as Direct Preference Optimization (DPO), in both single and multi-reward settings validated by evaluation on T2I benchmarks, including GenEval and T2I-Compbench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02588v1-abstract-full').style.display = 'none'; document.getElementById('2502.02588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02295">arXiv:2502.02295</a> <span> [<a href="https://arxiv.org/pdf/2502.02295">pdf</a>, <a href="https://arxiv.org/ps/2502.02295">ps</a>, <a href="https://arxiv.org/format/2502.02295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Intelligent Reflecting Surface Based Localization of Mixed Near-Field and Far-Field Targets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Weifeng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qipeng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuowen Zhang</a>, <a href="/search/cs?searchtype=author&query=Di%2C+B">Boya Di</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a>, <a href="/search/cs?searchtype=author&query=Eldar%2C+Y+C">Yonina C.
Eldar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02295v1-abstract-short" style="display: inline;"> This paper considers an intelligent reflecting surface (IRS)-assisted bi-static localization architecture for the sixth-generation (6G) integrated sensing and communication (ISAC) network. The system consists of a transmit user, a receive base station (BS), an IRS, and multiple targets in either the far-field or near-field region of the IRS. In particular, we focus on the challenging scenario wher… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02295v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02295v1-abstract-full" style="display: none;"> This paper considers an intelligent reflecting surface (IRS)-assisted bi-static localization architecture for the sixth-generation (6G) integrated sensing and communication (ISAC) network. The system consists of a transmit user, a receive base station (BS), an IRS, and multiple targets in either the far-field or near-field region of the IRS. In particular, we focus on the challenging scenario where the line-of-sight (LOS) paths between targets and the BS are blocked, such that the emitted orthogonal frequency division multiplexing (OFDM) signals from the user reach the BS merely via the user-target-IRS-BS path. Based on the signals received by the BS, our goal is to localize the targets by estimating their relative positions to the IRS, instead of to the BS. We show that subspace-based methods, such as the multiple signal classification (MUSIC) algorithm, can be applied onto the BS's received signals to estimate the relative states from the targets to the IRS. To this end, we create a virtual signal via combining user-target-IRS-BS channels over various time slots. By applying MUSIC on such a virtual signal, we are able to detect the far-field targets and the near-field targets, and estimate the angle-of-arrivals (AOAs) and/or ranges from the targets to the IRS. Furthermore, we theoretically verify that the proposed method can perfectly estimate the relative states from the targets to the IRS in the ideal case with infinite coherence blocks. Numerical results verify the effectiveness of our proposed IRS-assisted localization scheme. Our paper demonstrates the potential of employing passive anchors, i.e., IRSs, to improve the sensing coverage of the active anchors, i.e., BSs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02295v1-abstract-full').style.display = 'none'; document.getElementById('2502.02295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
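<p class="is-size-7 has-text-grey-dark">The MUSIC estimator referenced above is standard; for orientation, here is a textbook MUSIC direction-of-arrival sketch for a plain uniform linear array with half-wavelength spacing. The paper's IRS-specific virtual-signal construction is not reproduced; array sizes, angles, and noise levels are illustrative:</p>
<pre><code class="language-python">
import numpy as np

def steering(theta_deg, n_ant):
    """Steering vector of a uniform linear array, half-wavelength spacing."""
    phase = np.pi * np.sin(np.deg2rad(theta_deg))
    return np.exp(1j * phase * np.arange(n_ant))

def music_spectrum(snapshots, n_src, grid):
    """MUSIC pseudo-spectrum: steering vectors nearly orthogonal to the
    noise subspace of the sample covariance produce large peaks."""
    n_ant = snapshots.shape[0]
    cov = snapshots @ snapshots.conj().T / snapshots.shape[1]
    _, eigvecs = np.linalg.eigh(cov)          # eigenvalues ascending
    noise = eigvecs[:, : n_ant - n_src]       # noise subspace
    return np.array([1.0 / np.linalg.norm(noise.conj().T @ steering(t, n_ant)) ** 2
                     for t in grid])

# two sources at -20 deg and 35 deg, 8 antennas, 200 snapshots, mild noise
rng = np.random.default_rng(1)
n_ant, true_angles = 8, (-20.0, 35.0)
A = np.stack([steering(a, n_ant) for a in true_angles], axis=1)
S = rng.standard_normal((2, 200)) + 1j * rng.standard_normal((2, 200))
N = 0.1 * (rng.standard_normal((n_ant, 200)) + 1j * rng.standard_normal((n_ant, 200)))
X = A @ S + N

grid = np.linspace(-90.0, 90.0, 721)
spec = music_spectrum(X, n_src=2, grid=grid)
inner = spec[1:-1]
is_peak = np.logical_and(inner > spec[:-2], inner > spec[2:])
cand = grid[1:-1][is_peak]
print(np.sort(cand[np.argsort(inner[is_peak])[-2:]]))  # ~ [-20. 35.]
</code></pre>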
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02061">arXiv:2502.02061</a> <span> [<a href="https://arxiv.org/pdf/2502.02061">pdf</a>, <a href="https://arxiv.org/format/2502.02061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Reason4Rec: Large Language Models for Recommendation with Deliberative User Preference Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yi Fang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjie Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fengbin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifan Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+F">Fuli Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiangnan He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02061v2-abstract-short" style="display: inline;"> While recent advancements in aligning Large Language Models (LLMs) with recommendation tasks have shown great potential and promising performance overall, these aligned recommendation LLMs still face challenges in complex scenarios. This is primarily due to the current alignment approach focusing on optimizing LLMs to generate user feedback directly, without incorporating deliberation. To overcome… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02061v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02061v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02061v2-abstract-full" style="display: none;"> While recent advancements in aligning Large Language Models (LLMs) with recommendation tasks have shown great potential and promising performance overall, these aligned recommendation LLMs still face challenges in complex scenarios. This is primarily due to the current alignment approach focusing on optimizing LLMs to generate user feedback directly, without incorporating deliberation. To overcome this limitation and develop more reliable LLMs for recommendations, we propose a new Deliberative Recommendation task, which incorporates explicit reasoning about user preferences as an additional alignment goal. We then introduce the Reasoning-powered Recommender framework for deliberative user preference alignment, designed to enhance reasoning capabilities by utilizing verbalized user feedback in a step-wise manner to tackle this task. The framework employs collaborative step-wise experts and tailored training strategies for each expert. Experimental results across three real-world datasets demonstrate the rationality of the deliberative task formulation and the superior performance of the proposed framework in improving both prediction accuracy and reasoning quality. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02061v2-abstract-full').style.display = 'none'; document.getElementById('2502.02061v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01836">arXiv:2502.01836</a> <span> [<a href="https://arxiv.org/pdf/2502.01836">pdf</a>, <a href="https://arxiv.org/format/2502.01836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3709701">10.1145/3709701 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LeaFi: Data Series Indexes on Steroids with Learned Filters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qitong Wang</a>, <a href="/search/cs?searchtype=author&query=Ileana%2C+I">Ioana Ileana</a>, <a href="/search/cs?searchtype=author&query=Palpanas%2C+T">Themis Palpanas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01836v1-abstract-short" style="display: inline;"> The ever-growing collections of data series create a pressing need for efficient similarity search, which serves as the backbone for various analytics pipelines. Recent studies have shown that tree-based series indexes excel in many scenarios. However, we observe a significant waste of effort during search, due to suboptimal pruning. To address this issue, we introduce LeaFi, a novel framework tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01836v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01836v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01836v1-abstract-full" style="display: none;"> The ever-growing collections of data series create a pressing need for efficient similarity search, which serves as the backbone for various analytics pipelines. Recent studies have shown that tree-based series indexes excel in many scenarios. However, we observe a significant waste of effort during search, due to suboptimal pruning. To address this issue, we introduce LeaFi, a novel framework that uses machine learning models to boost pruning effectiveness of tree-based data series indexes. These models act as learned filters, which predict tight node-wise distance lower bounds that are used to make pruning decisions, thus, improving pruning effectiveness. 
We describe the LeaFi-enhanced index building algorithm, which selects leaf nodes and generates training data to insert and train machine learning models, as well as the LeaFi-enhanced search algorithm, which calibrates learned filters at query time to support the user-defined quality target of each query. Our experimental evaluation, using two different tree-based series indexes and five diverse datasets, demonstrates the advantages of the proposed approach. LeaFi-enhanced data-series indexes improve pruning ratio by up to 20x and search time by up to 32x, while maintaining a target recall of 99%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01836v1-abstract-full').style.display = 'none'; document.getElementById('2502.01836v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was published in Proc. ACM Manag. Data, Vol. 3, No. N1 (SIGMOD), Article 51. Publication date: February 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. ACM Manag. Data 3, N1 (SIGMOD), Article 51 (February 2025), 24 pages </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00734">arXiv:2502.00734</a> <span> [<a href="https://arxiv.org/pdf/2502.00734">pdf</a>, <a href="https://arxiv.org/format/2502.00734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CycleGuardian: A Framework for Automatic RespiratorySound classification Based on Improved Deep clustering and Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+Y">Yun Chu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuhao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+E">Enze Zhou</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+L">Ling Fu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+G">Gang Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00734v1-abstract-short" style="display: inline;"> Auscultation plays a pivotal role in early respiratory and pulmonary disease diagnosis. Despite the emergence of deep learning-based methods for automatic respiratory sound classification post-Covid-19, limited datasets impede performance enhancement. 
Distinguishing between normal and abnormal respiratory sounds poses challenges due to the coexistence of normal respiratory components and noise com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00734v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00734v1-abstract-full" style="display: none;"> Auscultation plays a pivotal role in early respiratory and pulmonary disease diagnosis. Despite the emergence of deep learning-based methods for automatic respiratory sound classification post-Covid-19, limited datasets impede performance enhancement. Distinguishing between normal and abnormal respiratory sounds poses challenges due to the coexistence of normal respiratory components and noise components in both types. Moreover, different abnormal respiratory sounds exhibit similar anomalous features, hindering their differentiation. Besides, existing state-of-the-art models suffer from excessive parameter size, impeding deployment on resource-constrained mobile platforms. To address these issues, we design a lightweight network, CycleGuardian, and propose a framework based on improved deep clustering and contrastive learning. We first generate a hybrid spectrogram for feature diversity and group spectrograms to facilitate intermittent abnormal sound capture. Then, CycleGuardian integrates a deep clustering module with a similarity-constrained clustering component to improve the ability to capture abnormal features and a contrastive learning module with group mixing for enhanced abnormal feature discernment. Multi-objective optimization enhances overall performance during training. In experiments on the ICBHI2017 dataset, following the official split method and without any pre-trained weights, our method achieves Sp: 82.06$\%$, Se: 44.47$\%$, and Score: 63.26$\%$ with a network model size of 38M. Compared to the current best model, our method leads by nearly 7$\%$, achieving the current best performance. Additionally, we deploy the network on Android devices, showcasing a comprehensive intelligent respiratory sound auscultation system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00734v1-abstract-full').style.display = 'none'; document.getElementById('2502.00734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
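<p class="is-size-7 has-text-grey-dark">The Sp/Se/Score figures quoted above follow the usual ICBHI convention: specificity computed over normal samples, sensitivity over abnormal ones, and their average as the final Score. A minimal sketch under that convention (the label encoding is assumed for illustration):</p>
<pre><code class="language-python">
def icbhi_score(y_true, y_pred, normal="normal"):
    """ICBHI-style metrics: Sp = accuracy on normal samples,
    Se = accuracy on abnormal samples, Score = (Sp + Se) / 2."""
    norm = [(t, p) for t, p in zip(y_true, y_pred) if t == normal]
    abn = [(t, p) for t, p in zip(y_true, y_pred) if t != normal]
    sp = sum(t == p for t, p in norm) / len(norm)
    se = sum(t == p for t, p in abn) / len(abn)
    return sp, se, (sp + se) / 2

# toy labels: 'normal' vs. abnormal classes ('crackle', 'wheeze', 'both')
y_true = ["normal", "crackle", "wheeze", "normal", "both"]
y_pred = ["normal", "crackle", "normal", "normal", "both"]
print(icbhi_score(y_true, y_pred))  # Sp=1.0, Se=2/3, Score~0.83
</code></pre>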
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00700">arXiv:2502.00700</a> <span> [<a href="https://arxiv.org/pdf/2502.00700">pdf</a>, <a href="https://arxiv.org/format/2502.00700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> S2CFormer: Reorienting Learned Image Compression from Spatial Interaction to Channel Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunuo Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bing He</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+D">Donghui Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+R">Ronghua Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Li Song</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+G">Guo Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00700v2-abstract-short" style="display: inline;"> Transformers have achieved significant success in learned image compression (LIC), with Swin Transformers emerging as the mainstream choice for nonlinear transforms. A common belief is that their sophisticated spatial operations contribute most to their efficacy. However, the crucial role of the feed-forward network (FFN) based Channel Aggregation module within the transformer architecture has bee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00700v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00700v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00700v2-abstract-full" style="display: none;"> Transformers have achieved significant success in learned image compression (LIC), with Swin Transformers emerging as the mainstream choice for nonlinear transforms. A common belief is that their sophisticated spatial operations contribute most to their efficacy. However, the crucial role of the feed-forward network (FFN) based Channel Aggregation module within the transformer architecture has been largely overlooked, and the over-design of spatial operations leads to a suboptimal trade-off between decoding latency and R-D performance. In this paper, we reevaluate the key factors behind the competence of transformers in LIC. By replacing spatial operations with identity mapping, we are surprised to find that channel operations alone can approach the R-D performance of the leading methods. This solid lower bound of performance emphasizes that the presence of channel aggregation is more essential for the LIC model to achieve competitive performance, while the previously complex spatial interactions are partly redundant. 
Based on this insight, we initiate the "S2CFormer" paradigm, a general architecture that reorients the focus of LIC from Spatial Interaction to Channel Aggregation. We present two instantiations of the S2CFormer: S2C-Conv and S2C-Attention. Each incorporates a simple operator for spatial interaction and serves as a nonlinear transform block for our LIC models. Both models demonstrate state-of-the-art (SOTA) R-D performance and significantly faster decoding speed. These results also motivate further exploration of advanced FFN structures to enhance the R-D performance while maintaining model efficiency. With these foundations, we introduce S2C-Hybrid, an enhanced LIC model that combines the strengths of different S2CFormer instantiations. This model outperforms all the existing methods on several datasets, setting a new benchmark for efficient and high-performance LIC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00700v2-abstract-full').style.display = 'none'; document.getElementById('2502.00700v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18653">arXiv:2501.18653</a> <span> [<a href="https://arxiv.org/pdf/2501.18653">pdf</a>, <a href="https://arxiv.org/format/2501.18653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Cogito, ergo sum: A Neurobiologically-Inspired Cognition-Memory-Growth System for Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanlong Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jindong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Menglin Yang</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+H">He Kong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shengsheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18653v1-abstract-short" style="display: inline;"> Large language model-based Multi-Agent Systems (MAS) have demonstrated promising performance for enhancing the efficiency and accuracy of code generation tasks. However, most existing methods follow a conventional sequence of planning, coding, and debugging, which contradicts the growth-driven nature of the human learning process.
Additionally, the frequent information interaction between multiple agent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18653v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18653v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18653v1-abstract-full" style="display: none;"> Large language model-based Multi-Agent Systems (MAS) have demonstrated promising performance for enhancing the efficiency and accuracy of code generation tasks. However, most existing methods follow a conventional sequence of planning, coding, and debugging, which contradicts the growth-driven nature of the human learning process. Additionally, the frequent information interaction between multiple agents inevitably involves high computational costs. In this paper, we propose Cogito, a neurobiologically inspired multi-agent framework to enhance the problem-solving capabilities in code generation tasks with lower cost. Specifically, Cogito adopts a reverse sequence: it first undergoes debugging, then coding, and finally planning. This approach mimics human learning and development, where knowledge is acquired progressively. Accordingly, a hippocampus-like memory module with different functions is designed to work with the pipeline to provide quick retrieval in similar tasks. Through this growth-based learning model, Cogito accumulates knowledge and cognitive skills at each stage, ultimately forming a Super Role, an all-capable agent, to perform the code generation task. Extensive experiments against representative baselines demonstrate the superior performance and efficiency of Cogito. The code is publicly available at https://anonymous.4open.science/r/Cogito-0083. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18653v1-abstract-full').style.display = 'none'; document.getElementById('2501.18653v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
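<p class="is-size-7 has-text-grey-dark">The reverse debugging-then-coding-then-planning pipeline with a retrieval memory described above can be pictured structurally as follows; the stage internals, the similarity-based memory, and all names are assumptions for illustration, not Cogito's implementation:</p>
<pre><code class="language-python">
from difflib import SequenceMatcher

class Memory:
    """Toy 'hippocampus': store task -> lesson pairs, recall by text similarity."""
    def __init__(self):
        self.store = {}

    def recall(self, task):
        best = max(self.store, default=None,
                   key=lambda k: SequenceMatcher(None, k, task).ratio())
        return self.store.get(best) if best is not None else None

    def remember(self, task, lesson):
        self.store[task] = lesson

def debug_stage(task, memory):
    # stage 1: study failures first, seeded by recalled experience
    return f"lessons({task}, prior={memory.recall(task)})"

def code_stage(task, lessons):
    # stage 2: write code informed by those lessons
    return f"code({task} given {lessons})"

def plan_stage(task, code):
    # stage 3: plan/refactor last, not first
    return f"plan({task} -> {code})"

memory = Memory()
task = "parse a CSV file and sum one column"
lessons = debug_stage(task, memory)
solution = plan_stage(task, code_stage(task, lessons))
memory.remember(task, lessons)
print(solution)
</code></pre>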
arXiv:2501.18457 (https://arxiv.org/abs/2501.18457) [cs.CL]
CALM: Unleashing the Cross-Lingual Self-Aligning Ability of Language Model Question Answering
Authors: Yumeng Wang, Zhiyuan Fan, Qingyun Wang, May Fung, Heng Ji
Abstract: Large Language Models (LLMs) are pretrained on extensive multilingual corpora to acquire both language-specific cultural knowledge and general knowledge. Ideally, LLMs should provide consistent responses to culture-independent questions across languages; in practice, we observe significant performance disparities. To address this, we explore the Cross-Lingual Self-Aligning ability of Language Models (CALM) to align knowledge across languages. Specifically, for a given question, we sample multiple responses across different languages and select the most self-consistent response as the target, leaving the remaining responses as negative examples. We then employ direct preference optimization (DPO) to align the model's knowledge across different languages. Evaluations on the MEDQA and X-CSQA datasets demonstrate CALM's effectiveness in enhancing cross-lingual knowledge question answering, both in zero-shot and retrieval-augmented settings. We also find that increasing the number of languages involved in CALM training leads to higher accuracy and consistency. We offer a qualitative analysis of how cross-lingual consistency can enhance knowledge alignment and explore the method's generalizability.
Submitted 10 February, 2025; v1 submitted 30 January, 2025; originally announced January 2025.
Comments: Accepted by NAACL 2025
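The core selection rule (pool multilingual samples, keep the most self-consistent answer as the DPO target, treat the rest as negatives) admits a compact sketch. The exact-string consistency measure below is an assumption; the paper likely compares normalized or semantically matched answers.

```python
from collections import Counter

def select_calm_pairs(responses_by_lang: dict[str, list[str]]):
    """Illustrative CALM-style preference-pair construction (details assumed).

    Pool sampled answers from all languages, treat the most frequent
    (i.e., most self-consistent) answer as the preferred target, and pair
    it against every other sampled answer as a negative.
    """
    pooled = [r for responses in responses_by_lang.values() for r in responses]
    chosen, _ = Counter(pooled).most_common(1)[0]
    rejected = [r for r in pooled if r != chosen]
    # The (chosen, rejected) pairs then feed a standard DPO objective.
    return [(chosen, neg) for neg in rejected]
```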
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18457v2-abstract-full').style.display = 'none'; document.getElementById('2501.18457v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18158">arXiv:2501.18158</a> <span> [<a href="https://arxiv.org/pdf/2501.18158">pdf</a>, <a href="https://arxiv.org/format/2501.18158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Cryptocurrency Transaction Analysis: A Bitcoin Case Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+Y">Yuchen Lei</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+Y">Yuexin Xiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qin Wang</a>, <a href="/search/cs?searchtype=author&query=Dowsley%2C+R">Rafael Dowsley</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+T+H">Tsz Hon Yuen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiangshan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18158v2-abstract-short" style="display: inline;"> Cryptocurrencies are widely used, yet current methods for analyzing transactions heavily rely on opaque, black-box models. These lack interpretability and adaptability, failing to effectively capture behavioral patterns. Many researchers, including us, believe that Large Language Models (LLMs) could bridge this gap due to their robust reasoning abilities for complex tasks. In this paper, we test t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18158v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18158v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18158v2-abstract-full" style="display: none;"> Cryptocurrencies are widely used, yet current methods for analyzing transactions heavily rely on opaque, black-box models. These lack interpretability and adaptability, failing to effectively capture behavioral patterns. Many researchers, including us, believe that Large Language Models (LLMs) could bridge this gap due to their robust reasoning abilities for complex tasks. In this paper, we test this hypothesis by applying LLMs to real-world cryptocurrency transaction graphs, specifically within the Bitcoin network. We introduce a three-tiered framework to assess LLM capabilities: foundational metrics, characteristic overview, and contextual interpretation. 
This includes a new, human-readable graph representation format, LLM4TG, and a connectivity-enhanced sampling algorithm, CETraS, which simplifies larger transaction graphs. Experimental results show that LLMs excel at foundational metrics and offer detailed characteristic overviews. Their effectiveness in contextual interpretation suggests they can provide useful explanations of transaction behaviors, even with limited labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18158v2-abstract-full').style.display = 'none'; document.getElementById('2501.18158v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17379">arXiv:2501.17379</a> <span> [<a href="https://arxiv.org/pdf/2501.17379">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Stable Tree Labelling for Accelerating Distance Queries on Dynamic Road Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Koehler%2C+H">Henning Koehler</a>, <a href="/search/cs?searchtype=author&query=Farhan%2C+M">Muhammad Farhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17379v1-abstract-short" style="display: inline;"> Finding the shortest-path distance between two arbitrary vertices is an important problem in road networks. Due to real-time traffic conditions, road networks undergo dynamic changes all the time. Current state-of-the-art methods incrementally maintain a distance labelling based on a hierarchy among vertices to support efficient distance computation. However, their labelling sizes are often large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17379v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17379v1-abstract-full" style="display: none;"> Finding the shortest-path distance between two arbitrary vertices is an important problem in road networks. Due to real-time traffic conditions, road networks undergo dynamic changes all the time. Current state-of-the-art methods incrementally maintain a distance labelling based on a hierarchy among vertices to support efficient distance computation. However, their labelling sizes are often large and cannot be efficiently maintained. To combat these issues, we present a simple yet efficient labelling method, namely \emph{Stable Tree Labelling} (STL), for answering distance queries on dynamic road networks. 
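As a loose illustration of connectivity-guided graph simplification (the abstract does not spell out CETraS), here is a toy sketch that keeps only the best-connected transaction nodes so the induced subgraph fits an LLM's context window; the degree heuristic is an assumption.

```python
import networkx as nx

def connectivity_sample(g: nx.DiGraph, budget: int) -> nx.DiGraph:
    """Toy connectivity-guided subgraph sampling in the spirit of CETraS.

    This sketch simply keeps the `budget` highest-degree transaction nodes;
    the actual CETraS algorithm is not described in the abstract and is
    almost certainly more nuanced.
    """
    keep = sorted(g.nodes, key=g.degree, reverse=True)[:budget]
    return g.subgraph(keep).copy()
```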
arXiv:2501.17379 (https://arxiv.org/abs/2501.17379) [cs.DS, cs.DB]
Stable Tree Labelling for Accelerating Distance Queries on Dynamic Road Networks
Authors: Henning Koehler, Muhammad Farhan, Qing Wang
Abstract: Finding the shortest-path distance between two arbitrary vertices is an important problem in road networks. Due to real-time traffic conditions, road networks undergo dynamic changes all the time. Current state-of-the-art methods incrementally maintain a distance labelling based on a hierarchy among vertices to support efficient distance computation. However, their labelling sizes are often large and cannot be efficiently maintained. To combat these issues, we present a simple yet efficient labelling method, namely Stable Tree Labelling (STL), for answering distance queries on dynamic road networks. We observe that the properties of an underlying hierarchy play an important role in improving and balancing query and update performance. Thus, we introduce the notion of a stable tree hierarchy, which lays the ground for developing efficient maintenance algorithms on dynamic road networks. Based on a stable tree hierarchy, STL can be efficiently constructed as a 2-hop labelling. A crucial ingredient of STL is to store only distances within subgraphs in labels, rather than distances in the entire graph, which restricts the labels affected by dynamic changes. We further develop two efficient maintenance algorithms upon STL: the Label Search algorithm and the Pareto Search algorithm. The Label Search algorithm identifies affected ancestors in a stable tree hierarchy and performs efficient searches to update labels from those ancestors. The Pareto Search algorithm explores the interaction between search spaces of different ancestors, and combines searches from multiple ancestors into only two searches for each update, eliminating duplicate graph traversals. The experiments show that our algorithms significantly outperform state-of-the-art dynamic methods in maintaining the labelling and query processing, while requiring an order of magnitude less space.
Submitted 28 January, 2025; originally announced January 2025.
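Since STL is constructed as a 2-hop labelling, the query side is the standard hub-label lookup, which is worth seeing because it explains why restricting label distances to subgraphs limits how many labels an update touches. A minimal sketch:

```python
def query_distance(label_u: dict, label_v: dict) -> float:
    """Standard 2-hop labelling query (the mechanism STL builds on).

    Each label maps hub vertices to distances; the shortest u-v distance
    is the best distance through any shared hub.
    """
    common = label_u.keys() & label_v.keys()
    return min((label_u[h] + label_v[h] for h in common), default=float("inf"))
```

For example, query_distance({"a": 2, "b": 5}, {"a": 4, "c": 1}) returns 6, the best distance through the shared hub "a".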
arXiv:2501.16945 (https://arxiv.org/abs/2501.16945) [cs.LG, cs.AI, cs.CL, cs.SE]
ToolFactory: Automating Tool Generation by Leveraging LLM to Understand REST API Documentations
Authors: Xinyi Ni, Qiuyang Wang, Yukun Zhang, Pengyu Hong
Abstract: LLM-based tool agents offer natural language interfaces, enabling users to seamlessly interact with computing services. While REST APIs are valuable resources for building such agents, they must first be transformed into AI-compatible tools. Automatically generating AI-compatible tools from REST API documents can greatly streamline tool agent development and minimize user learning curves. However, API documentation often suffers from a lack of standardization, inconsistent schemas, and incomplete information. To address these issues, we developed ToolFactory, an open-source pipeline for automating tool generation from unstructured API documents. To enhance the reliability of the developed tools, we implemented an evaluation method to diagnose errors. Furthermore, we built a knowledge base of verified tools, which we leveraged to infer missing information from poorly documented APIs. We developed the API Extraction Benchmark, comprising 167 API documents and 744 endpoints in various formats, and designed a JSON schema to annotate them. This annotated dataset was utilized to train and validate ToolFactory. The experimental results highlight the effectiveness of ToolFactory. We also demonstrated ToolFactory by creating a domain-specific AI agent for glycomaterials research. ToolFactory exhibits significant potential for facilitating the seamless integration of scientific REST APIs into AI workflows.
Submitted 28 January, 2025; originally announced January 2025.
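The endpoint-to-tool step can be pictured as wrapping each annotated endpoint in a validated callable. The schema fields and wrapper below are hypothetical; ToolFactory's actual JSON schema and generated tools are richer.

```python
import requests

def make_tool(endpoint: dict):
    """Turn one annotated REST endpoint into a callable tool.

    `endpoint` follows a hypothetical schema with "method", "url",
    "params", and optional "name" fields; this is illustrative only.
    """
    def tool(**kwargs):
        unknown = set(kwargs) - set(endpoint["params"])
        if unknown:
            raise ValueError(f"unexpected parameters: {unknown}")
        resp = requests.request(endpoint["method"], endpoint["url"], params=kwargs)
        resp.raise_for_status()
        return resp.json()
    tool.__name__ = endpoint.get("name", "api_tool")
    return tool
```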
arXiv:2501.16404 (https://arxiv.org/abs/2501.16404) [cs.LG, cs.AI, cs.CL]
DynaPrompt: Dynamic Test-Time Prompt Tuning
Authors: Zehao Xiao, Shilin Yan, Jack Hong, Jiayin Cai, Xiaolong Jiang, Yao Hu, Jiayi Shen, Qi Wang, Cees G. M. Snoek
Abstract: Test-time prompt tuning enhances zero-shot generalization of vision-language models but tends to ignore the relatedness among test samples during inference. Online test-time prompt tuning provides a simple way to leverage the information in previous test samples, albeit with the risk of prompt collapse due to error accumulation. To enhance test-time prompt tuning, we propose DynaPrompt, short for dynamic test-time prompt tuning, exploiting relevant data distribution information while reducing error accumulation. Built on an online prompt buffer, DynaPrompt adaptively selects and optimizes the relevant prompts for each test sample during tuning. Specifically, we introduce a dynamic prompt selection strategy based on two metrics: prediction entropy and probability difference. For unseen test data information, we develop dynamic prompt appending, which allows the buffer to append new prompts and delete the inactive ones. By doing so, the prompts are optimized to exploit beneficial information on specific test data, while alleviating error accumulation. Experiments on fourteen datasets demonstrate the effectiveness of dynamic test-time prompt tuning.
Submitted 27 January, 2025; originally announced January 2025.
Comments: ICLR 2025
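The two selection metrics named in the abstract, prediction entropy and probability difference, are straightforward to compute per prompt. A sketch follows; how DynaPrompt thresholds and combines them in the buffer is assumed away here.

```python
import torch

def prompt_scores(logits: torch.Tensor):
    """Selection signals described in the DynaPrompt abstract.

    `logits`: per-prompt class logits of shape (num_prompts, num_classes).
    Lower entropy means a more confident prediction; a larger gap between
    the top-1 and top-2 probabilities means a sharper one.
    """
    probs = logits.softmax(dim=-1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1)
    top2 = probs.topk(2, dim=-1).values
    prob_diff = top2[:, 0] - top2[:, 1]
    return entropy, prob_diff
```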
arXiv:2501.16165 (https://arxiv.org/abs/2501.16165) [cs.CR, cs.CY, cs.OS]
Demystifying OS Kernel Fuzzing with a Novel Taxonomy
Authors: Jiacheng Xu, He Sun, Shihao Jiang, Qinying Wang, Mingming Zhang, Xiang Li, Kaiwen Shen, Peng Cheng, Jiming Chen, Charles Zhang, Shouling Ji
Abstract: The Operating System (OS) kernel is foundational in modern computing, especially with the proliferation of diverse computing devices. However, its development also comes with vulnerabilities that can lead to severe security breaches. Kernel fuzzing, a technique used to uncover these vulnerabilities, poses distinct challenges when compared to userspace fuzzing. These include the complexity of configuring the testing environment and addressing the statefulness inherent to both the kernel and the fuzzing process. Despite the significant interest from the security community, a comprehensive understanding of kernel fuzzing remains lacking, hindering further progress in the field. In this paper, we present the first systematic study dedicated to OS kernel fuzzing. It begins by summarizing the progress of 99 academic studies from top-tier venues between 2017 and 2024. Following this, we introduce a stage-based fuzzing model and a novel fuzzing taxonomy that highlights nine core functionalities unique to kernel fuzzing. These functionalities are examined alongside their corresponding methodological approaches based on qualitative evaluation criteria. Our systematization identifies challenges in meeting functionality requirements and proposes potential technical solutions. Finally, we outline promising and practical future directions to guide forthcoming research in kernel security, supported in part by insights derived from our case study.
Submitted 27 January, 2025; originally announced January 2025.
arXiv:2501.15987 (https://arxiv.org/abs/2501.15987) [math.NA, cs.AI]
MultiPDENet: PDE-embedded Learning with Multi-time-stepping for Accelerated Flow Simulation
Authors: Qi Wang, Yuan Mi, Haoyun Wang, Yi Zhang, Ruizhi Chengze, Hongsheng Liu, Ji-Rong Wen, Hao Sun
Abstract: Solving partial differential equations (PDEs) with numerical methods is computationally expensive, since accurate solutions require fine grids and small time steps. Machine learning can accelerate this process, but learned models struggle with weak generalizability, interpretability, and data dependency, and suffer in long-term prediction. To this end, we propose a PDE-embedded network with multiscale time stepping (MultiPDENet), which fuses numerical schemes with machine learning for accelerated simulation of flows. In particular, we design a convolutional filter based on the structure of finite difference stencils with a small number of parameters to optimize, which estimates the equivalent form of the spatial derivative on a coarse grid so as to minimize the equation's residual. A Physics Block with a 4th-order Runge-Kutta integrator at the fine time scale embeds the structure of the PDEs to guide the prediction. To alleviate the curse of temporal error accumulation in long-term prediction, we introduce a multiscale time integration approach, where a neural network corrects the prediction error at a coarse time scale. Experiments across various PDE systems, including the Navier-Stokes equations, demonstrate that MultiPDENet can accurately predict long-term spatiotemporal dynamics, even given small and incomplete training data, e.g., spatiotemporally down-sampled datasets. MultiPDENet achieves state-of-the-art performance compared with other neural baseline models, with a clear speedup over classical numerical methods.
Submitted 27 January, 2025; originally announced January 2025.
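Two ingredients from this abstract, finite-difference stencils expressed as convolutions and a 4th-order Runge-Kutta integrator, can be sketched directly. The heat-equation right-hand side below is an illustrative stand-in, not one of the paper's benchmark systems, and MultiPDENet learns its stencil weights rather than fixing them.

```python
import torch
import torch.nn.functional as F

# Classic 5-point Laplacian stencil expressed as a convolution kernel;
# MultiPDENet instead learns a small number of such stencil weights.
LAPLACIAN = torch.tensor([[0., 1., 0.],
                          [1., -4., 1.],
                          [0., 1., 0.]]).view(1, 1, 3, 3)

def rk4_step(u: torch.Tensor, rhs, dt: float) -> torch.Tensor:
    """One 4th-order Runge-Kutta update, as in the paper's Physics Block."""
    k1 = rhs(u)
    k2 = rhs(u + 0.5 * dt * k1)
    k3 = rhs(u + 0.5 * dt * k2)
    k4 = rhs(u + dt * k3)
    return u + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)

def heat_rhs(u: torch.Tensor, nu: float = 0.1, dx: float = 1.0) -> torch.Tensor:
    # du/dt = nu * Laplacian(u) on a grid; u has shape (batch, 1, H, W).
    return nu * F.conv2d(u, LAPLACIAN, padding=1) / dx**2
```

A single fine-scale step is then rk4_step(u, heat_rhs, dt); the paper's coarse-scale network correction sits on top of many such steps.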
arXiv:2501.15770 (https://arxiv.org/abs/2501.15770) [cs.HC]
Walk in Their Shoes to Navigate Your Own Path: Learning About Procrastination Through A Serious Game
Authors: Runhua Zhang, Jiaqi Gan, Shangyuan Gao, Siyi Chen, Xinyu Wu, Dong Chen, Yulin Tian, Qi Wang, Pengcheng An
Abstract: Procrastination, the voluntary delay of tasks despite potential negative consequences, has prompted numerous time and task management interventions in the HCI community. While these interventions have shown promise in addressing specific behaviors, psychological theories suggest that learning about procrastination itself may help individuals develop their own coping strategies and build mental resilience. However, little research has explored how to support this learning process through HCI approaches. We present ProcrastiMate, a text adventure game where players learn about procrastination's causes and experiment with coping strategies by guiding in-game characters in managing relatable scenarios. Our field study with 27 participants revealed that ProcrastiMate facilitated learning and self-reflection while maintaining psychological distance, motivating players to integrate newly acquired knowledge in daily life. This paper contributes empirical insights on leveraging serious games to facilitate learning about procrastination and offers design implications for addressing psychological challenges through HCI approaches.
Submitted 26 January, 2025; originally announced January 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15770v1-abstract-full').style.display = 'none'; document.getElementById('2501.15770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15588">arXiv:2501.15588</a> <span> [<a href="https://arxiv.org/pdf/2501.15588">pdf</a>, <a href="https://arxiv.org/format/2501.15588">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Tumor Detection, Segmentation and Classification Challenge on Automated 3D Breast Ultrasound: The TDSC-ABUS Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+G">Gongning Luo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mingwang Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xinjie Liang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+X">Xing Tao</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+D">Dong Ni</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+H">Hyunsu Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+C">Chulhong Kim</a>, <a href="/search/cs?searchtype=author&query=Stock%2C+R">Raphael Stock</a>, <a href="/search/cs?searchtype=author&query=Baumgartner%2C+M">Michael Baumgartner</a>, <a href="/search/cs?searchtype=author&query=Kirchhoff%2C+Y">Yannick Kirchhoff</a>, <a href="/search/cs?searchtype=author&query=Rokuss%2C+M">Maximilian Rokuss</a>, <a href="/search/cs?searchtype=author&query=Maier-Hein%2C+K">Klaus Maier-Hein</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhikai Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+T">Tianyu Fan</a>, <a href="/search/cs?searchtype=author&query=Boutry%2C+N">Nicolas Boutry</a>, <a href="/search/cs?searchtype=author&query=Tereshchenko%2C+D">Dmitry Tereshchenko</a>, <a href="/search/cs?searchtype=author&query=Moine%2C+A">Arthur Moine</a>, <a href="/search/cs?searchtype=author&query=Charmetant%2C+M">Maximilien Charmetant</a>, <a href="/search/cs?searchtype=author&query=Sauer%2C+J">Jan Sauer</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hao Du</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiang-Hui Bai</a>, <a href="/search/cs?searchtype=author&query=Raikar%2C+V+P">Vipul Pai Raikar</a>, <a href="/search/cs?searchtype=author&query=Montoya-del-Angel%2C+R">Ricardo Montoya-del-Angel</a>, <a href="/search/cs?searchtype=author&query=Marti%2C+R">Robert Marti</a> , et al. (12 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15588v1-abstract-short" style="display: inline;"> Breast cancer is one of the most common causes of death among women worldwide. 
Early detection helps in reducing the number of deaths. Automated 3D Breast Ultrasound (ABUS) is a newer approach for breast screening, which has many advantages over handheld mammography such as safety, speed, and higher detection rate of breast cancer. Tumor detection, segmentation, and classification are key componen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15588v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15588v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15588v1-abstract-full" style="display: none;"> Breast cancer is one of the most common causes of death among women worldwide. Early detection helps in reducing the number of deaths. Automated 3D Breast Ultrasound (ABUS) is a newer approach for breast screening, which has many advantages over handheld mammography such as safety, speed, and higher detection rate of breast cancer. Tumor detection, segmentation, and classification are key components in the analysis of medical images, especially challenging in the context of 3D ABUS due to the significant variability in tumor size and shape, unclear tumor boundaries, and a low signal-to-noise ratio. The lack of publicly accessible, well-labeled ABUS datasets further hinders the advancement of systems for breast tumor analysis. Addressing this gap, we have organized the inaugural Tumor Detection, Segmentation, and Classification Challenge on Automated 3D Breast Ultrasound 2023 (TDSC-ABUS2023). This initiative aims to spearhead research in this field and create a definitive benchmark for tasks associated with 3D ABUS image analysis. In this paper, we summarize the top-performing algorithms from the challenge and provide critical analysis for ABUS image examination. We offer the TDSC-ABUS challenge as an open-access platform at https://tdsc-abus2023.grand-challenge.org/ to benchmark and inspire future developments in algorithmic research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15588v1-abstract-full').style.display = 'none'; document.getElementById('2501.15588v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.15077 (https://arxiv.org/abs/2501.15077) [cs.CR, cs.DB]
NetChain: Authenticated Blockchain Top-k Graph Data Queries and its Application in Asset Management
Authors: Hongguang Zhao, Xu Yang, Saiyu Qi, Qiuhao Wang, Ke Li
Abstract: As a valuable digital resource, graph data is an important data asset, which has been widely utilized across various fields to optimize decision-making and enable smarter solutions. To manage data assets, blockchain is widely used to enable data sharing and trading, but it cannot support complex analytical queries. vChain was proposed to achieve verifiable boolean queries over blockchain by designing an embedded authenticated data structure (ADS). However, for generating (non-)existence proofs, vChain suffers from expensive storage and computation costs in ADS construction, along with high communication and verification costs. In this paper, we propose a novel NetChain framework that enables efficient top-k queries over on-chain graph data with verifiability. Specifically, we design a novel authenticated two-layer index that supports (non-)existence proof generation at the block level and built-in verifiability for matched objects. To further alleviate the computation and verification overhead, an optimized variant NetChain+ is derived. The authenticity of our frameworks is validated through security analysis. Evaluations show that NetChain and NetChain+ outperform vChain, respectively achieving up to 85X and 31X improvements on ADS construction. Moreover, compared with vChain, NetChain+ reduces the communication and verification costs by 87% and 96% respectively.
Submitted 24 January, 2025; originally announced January 2025.
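For intuition about what an embedded ADS buys you: the basic building block is a membership proof checked against an on-chain digest, as in the generic Merkle verification below. NetChain's authenticated two-layer top-k index is considerably more elaborate than this sketch.

```python
import hashlib

def h(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()

def verify_merkle_proof(leaf: bytes, proof: list[tuple[bytes, str]], root: bytes) -> bool:
    """Generic Merkle membership check, the textbook building block behind
    authenticated data structures such as vChain's and NetChain's indexes.

    `proof` is a list of (sibling_hash, side) pairs ordered from leaf to root;
    `side` says whether the sibling sits to the "left" or "right" of our node.
    """
    node = h(leaf)
    for sibling, side in proof:
        node = h(sibling + node) if side == "left" else h(node + sibling)
    return node == root
```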
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15077v1-abstract-full').style.display = 'none'; document.getElementById('2501.15077v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15034">arXiv:2501.15034</a> <span> [<a href="https://arxiv.org/pdf/2501.15034">pdf</a>, <a href="https://arxiv.org/format/2501.15034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Divergence-Augmented Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yingru Li</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jiechao Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15034v1-abstract-short" style="display: inline;"> In deep reinforcement learning, policy optimization methods need to deal with issues such as function approximation and the reuse of off-policy data. Standard policy gradient methods do not handle off-policy data well, leading to premature convergence and instability. This paper introduces a method to stabilize policy optimization when off-policy data are reused. The idea is to include a Bregman d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15034v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15034v1-abstract-full" style="display: none;"> In deep reinforcement learning, policy optimization methods need to deal with issues such as function approximation and the reuse of off-policy data. Standard policy gradient methods do not handle off-policy data well, leading to premature convergence and instability. This paper introduces a method to stabilize policy optimization when off-policy data are reused. The idea is to include a Bregman divergence between the behavior policy that generates the data and the current policy to ensure small and safe policy updates with off-policy data. The Bregman divergence is calculated between the state distributions of two policies, instead of only on the action probabilities, leading to a divergence augmentation formulation. Empirical experiments on Atari games show that in the data-scarce scenario where the reuse of off-policy data becomes necessary, our method can achieve better performance than other state-of-the-art deep reinforcement learning algorithms. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15034v1-abstract-full').style.display = 'none'; document.getElementById('2501.15034v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33rd Conference on Neural Information Processing Systems (NeurIPS 2019), Vancouver, Canada</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Q&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 
476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>