Search | arXiv e-print repository

Showing 1–50 of 1,246 results for author: Xu, M
Searching in archive cs (arXiv search v0.5.6, released 2020-02-24). Results sorted by announcement date (newest first), 50 per page.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xu%2C+M&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08003">arXiv:2502.08003</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08003">pdf</a>, <a href="https://arxiv.org/format/2502.08003">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Heterogeneous Multi-agent Multi-armed Bandits on Stochastic Block Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengfan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+L">Liren Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Ghaffari%2C+F">Fatemeh Ghaffari</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuchuang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xutong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hajiesmaili%2C+M">Mohammad Hajiesmaili</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08003v1-abstract-short" style="display: inline;"> We study a novel heterogeneous multi-agent multi-armed bandit problem with a cluster structure induced by stochastic block models, influencing not only graph topology, but also reward heterogeneity. 
Specifically, agents are distributed on random graphs based on stochastic block models - a generalized Erdos-Renyi model with heterogeneous edge probabilities: agents are grouped into clusters (known o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08003v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08003v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08003v1-abstract-full" style="display: none;"> We study a novel heterogeneous multi-agent multi-armed bandit problem with a cluster structure induced by stochastic block models, influencing not only graph topology, but also reward heterogeneity. Specifically, agents are distributed on random graphs based on stochastic block models - a generalized Erdos-Renyi model with heterogeneous edge probabilities: agents are grouped into clusters (known or unknown); edge probabilities for agents within the same cluster differ from those across clusters. In addition, the cluster structure in stochastic block model also determines our heterogeneous rewards. Rewards distributions of the same arm vary across agents in different clusters but remain consistent within a cluster, unifying homogeneous and heterogeneous settings and varying degree of heterogeneity, and rewards are independent samples from these distributions. The objective is to minimize system-wide regret across all agents. To address this, we propose a novel algorithm applicable to both known and unknown cluster settings. The algorithm combines an averaging-based consensus approach with a newly introduced information aggregation and weighting technique, resulting in a UCB-type strategy. It accounts for graph randomness, leverages both intra-cluster (homogeneous) and inter-cluster (heterogeneous) information from rewards and graphs, and incorporates cluster detection for unknown cluster settings. We derive optimal instance-dependent regret upper bounds of order $\log{T}$ under sub-Gaussian rewards. Importantly, our regret bounds capture the degree of heterogeneity in the system (an additional layer of complexity), exhibit smaller constants, scale better for large systems, and impose significantly relaxed assumptions on edge probabilities. In contrast, prior works have not accounted for this refined problem complexity, rely on more stringent assumptions, and exhibit limited scalability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08003v1-abstract-full').style.display = 'none'; document.getElementById('2502.08003v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
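The consensus-plus-UCB recipe in this abstract is easy to picture in code. Below is a minimal sketch, not the paper's actual algorithm: it assumes a row-stochastic mixing matrix W built from the current random graph and a generic exploration bonus, with none of the paper's cluster-aware weighting or cluster detection.

```python
# Toy round of a consensus-averaged UCB strategy (illustrative only).
import numpy as np

def ucb_consensus_round(means, counts, W, t, c=2.0):
    """means, counts: (n_agents, n_arms) local statistics;
    W: (n_agents, n_agents) row-stochastic mixing matrix for this round's
    sampled graph. Returns the arm each agent pulls."""
    mixed = W @ means                                    # averaging-based consensus
    bonus = np.sqrt(c * np.log(max(t, 2)) / np.maximum(counts, 1))
    return np.argmax(mixed + bonus, axis=1)
```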
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">55 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07856">arXiv:2502.07856</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07856">pdf</a>, <a href="https://arxiv.org/format/2502.07856">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MRS: A Fast Sampler for Mean Reverting Diffusion based on ODE and SDE Solvers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+A">Ao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+W">Wei Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Hongbo Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+L">Le Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minfeng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07856v2-abstract-short" style="display: inline;"> In applications of diffusion models, controllable generation is of practical significance, but is also challenging. Current methods for controllable generation primarily focus on modifying the score function of diffusion models, while Mean Reverting (MR) Diffusion directly modifies the structure of the stochastic differential equation (SDE), making the incorporation of image conditions simpler and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07856v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07856v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07856v2-abstract-full" style="display: none;"> In applications of diffusion models, controllable generation is of practical significance, but is also challenging. Current methods for controllable generation primarily focus on modifying the score function of diffusion models, while Mean Reverting (MR) Diffusion directly modifies the structure of the stochastic differential equation (SDE), making the incorporation of image conditions simpler and more natural. However, current training-free fast samplers are not directly applicable to MR Diffusion. And thus MR Diffusion requires hundreds of NFEs (number of function evaluations) to obtain high-quality samples. In this paper, we propose a new algorithm named MRS (MR Sampler) to reduce the sampling NFEs of MR Diffusion. We solve the reverse-time SDE and the probability flow ordinary differential equation (PF-ODE) associated with MR Diffusion, and derive semi-analytical solutions. The solutions consist of an analytical function and an integral parameterized by a neural network. Based on this solution, we can generate high-quality samples in fewer steps. 
Our approach does not require training and supports all mainstream parameterizations, including noise prediction, data prediction and velocity prediction. Extensive experiments demonstrate that MR Sampler maintains high sampling quality with a speedup of 10 to 20 times across ten different image restoration tasks. Our algorithm accelerates the sampling procedure of MR Diffusion, making it more practical in controllable generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07856v2-abstract-full').style.display = 'none'; document.getElementById('2502.07856v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07365">arXiv:2502.07365</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07365">pdf</a>, <a href="https://arxiv.org/format/2502.07365">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LongReD: Mitigating Short-Text Degradation of Long-Context Large Language Models via Restoration Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zican Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jinhao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingyu Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W+X">Wayne Xin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bingning Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weipeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07365v1-abstract-short" style="display: inline;"> Large language models (LLMs) have gained extended context windows through scaling positional encodings and lightweight continual pre-training. However, this often leads to degraded performance on short-text tasks, while the reasons for this degradation remain insufficiently explored. 
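To make "semi-analytical" concrete: for a mean-reverting SDE of the form dx = θ(μ - x)dt + σ dW, the linear drift integrates in closed form, so only the network-parameterized term needs numerical approximation. The step below is an illustrative exponential-integrator sketch with invented coefficient handling and a hypothetical `score_net`; it is not the MR Sampler update derived in the paper.

```python
# One illustrative semi-analytic sampling step for a mean-reverting SDE.
import math

def mr_semi_analytic_step(x, t, dt, mu, theta, sigma, score_net):
    decay = math.exp(-theta * dt)           # closed-form Ornstein-Uhlenbeck decay
    mean_reverted = mu + (x - mu) * decay   # analytic pull toward the mean mu
    # First-order treatment of the learned integral term (assumed form).
    correction = (sigma ** 2 / theta) * (1.0 - decay) * score_net(x, t)
    return mean_reverted + correction
```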
3. arXiv:2502.07365 [pdf, other]
LongReD: Mitigating Short-Text Degradation of Long-Context Large Language Models via Restoration Distillation
Authors: Zican Dong, Junyi Li, Jinhao Jiang, Mingyu Xu, Wayne Xin Zhao, Bingning Wang, Weipeng Chen
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Abstract: Large language models (LLMs) have gained extended context windows through scaling positional encodings and lightweight continual pre-training. However, this often leads to degraded performance on short-text tasks, while the reasons for this degradation remain insufficiently explored. In this work, we identify two primary factors contributing to this issue: distribution drift in hidden states and attention scores, and catastrophic forgetting during continual pre-training. To address these challenges, we propose Long Context Pre-training with Restoration Distillation (LongReD), a novel approach designed to mitigate short-text performance degradation through minimizing the distribution discrepancy between the extended and original models. Besides training on long texts, LongReD distills the hidden states of selected layers from the original model on short texts. Additionally, LongReD introduces a short-to-long distillation, aligning the output distribution on short texts with that on long texts by leveraging skipped positional indices. Experiments on common text benchmarks demonstrate that LongReD effectively preserves the model's short-text performance while maintaining comparable or even better capacity to handle long texts than baselines.
Submitted 11 February, 2025; originally announced February 2025.
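The layer-wise restoration distillation can be sketched as a simple hidden-state matching loss on short-text batches. Shapes, the layer-selection policy, and the loss choice below are assumptions, and the short-to-long distillation term is omitted.

```python
# Minimal sketch of hidden-state distillation from a frozen original model.
import torch.nn.functional as F

def restoration_distill_loss(student_hiddens, teacher_hiddens, layers):
    """student_hiddens / teacher_hiddens: per-layer lists of
    (batch, seq_len, hidden_dim) tensors from the extended model and the
    frozen original model on the same short-text batch."""
    return sum(F.mse_loss(student_hiddens[i], teacher_hiddens[i].detach())
               for i in layers) / len(layers)
```

In training, a term like this would be added to the long-text language-modeling loss with some weighting.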
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07319">arXiv:2502.07319</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07319">pdf</a>, <a href="https://arxiv.org/ps/2502.07319">ps</a>, <a href="https://arxiv.org/format/2502.07319">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Learnable Residual-based Latent Denoising in Semantic Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingkai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yongpeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yuxuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xiang-Gen Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenjun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Ping Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07319v1-abstract-short" style="display: inline;"> A latent denoising semantic communication (SemCom) framework is proposed for robust image transmission over noisy channels. By incorporating a learnable latent denoiser into the receiver, the received signals are preprocessed to effectively remove the channel noise and recover the semantic information, thereby enhancing the quality of the decoded images. Specifically, a latent denoising mapping is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07319v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07319v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07319v1-abstract-full" style="display: none;"> A latent denoising semantic communication (SemCom) framework is proposed for robust image transmission over noisy channels. By incorporating a learnable latent denoiser into the receiver, the received signals are preprocessed to effectively remove the channel noise and recover the semantic information, thereby enhancing the quality of the decoded images. Specifically, a latent denoising mapping is established by an iterative residual learning approach to improve the denoising efficiency while ensuring stable performance. Moreover, channel signal-to-noise ratio (SNR) is utilized to estimate and predict the latent similarity score (SS) for conditional denoising, where the number of denoising steps is adapted based on the predicted SS sequence, further reducing the communication latency. Finally, simulations demonstrate that the proposed framework can effectively and efficiently remove the channel noise at various levels and reconstruct visual-appealing images. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07319v1-abstract-full').style.display = 'none'; document.getElementById('2502.07319v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by IEEE Wireless Communications Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07276">arXiv:2502.07276</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07276">pdf</a>, <a href="https://arxiv.org/format/2502.07276">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dataset Ownership Verification in Contrastive Pre-trained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yuechen Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jie Song</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+M">Mengqi Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Haofei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xingen Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+B">Bingde Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Genlang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+M">Mingli Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07276v1-abstract-short" style="display: inline;"> High-quality open-source datasets, which necessitate substantial efforts for curation, has become the primary catalyst for the swift progress of deep learning. Concurrently, protecting these datasets is paramount for the well-being of the data owner. Dataset ownership verification emerges as a crucial method in this domain, but existing approaches are often limited to supervised models and cannot&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07276v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07276v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07276v1-abstract-full" style="display: none;"> High-quality open-source datasets, which necessitate substantial efforts for curation, has become the primary catalyst for the swift progress of deep learning. Concurrently, protecting these datasets is paramount for the well-being of the data owner. 
6. arXiv:2502.06693 [pdf, ps, other]
Recent Advances, Applications and Open Challenges in Machine Learning for Health: Reflections from Research Roundtables at ML4H 2024 Symposium
Authors: Amin Adibi, Xu Cao, Zongliang Ji, Jivat Neet Kaur, Winston Chen, Elizabeth Healey, Brighton Nuwagira, Wenqian Ye, Geoffrey Woollard, Maxwell A Xu, Hejie Cui, Johnny Xi, Trenton Chang, Vasiliki Bikia, Nicole Zhang, Ayush Noori, Yuan Xia, Md. Belal Hossain, Hanna A. Frank, Alina Peluso, Yuan Pu, Shannon Zejiang Shen, John Wu, Adibvafa Fallahpour, Sazan Mahbub, et al. (17 additional authors not shown)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CY (Computers and Society)
Abstract: The fourth Machine Learning for Health (ML4H) symposium was held in person on December 15th and 16th, 2024, in the traditional, ancestral, and unceded territories of the Musqueam, Squamish, and Tsleil-Waututh Nations in Vancouver, British Columbia, Canada. The symposium included research roundtable sessions to foster discussions between participants and senior researchers on timely and relevant topics for the ML4H community. The organization of the research roundtables at the conference involved 13 senior and 27 junior chairs across 13 tables. Each roundtable session included an invited senior chair (with substantial experience in the field), junior chairs (responsible for facilitating the discussion), and attendees from diverse backgrounds with an interest in the session's topic.
Submitted 10 February, 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05765">arXiv:2502.05765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05765">pdf</a>, <a href="https://arxiv.org/format/2502.05765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Privacy-Preserving Dataset Combination </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fuentes%2C+K">Keren Fuentes</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mimee Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+I">Irene Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05765v1-abstract-short" style="display: inline;"> Access to diverse, high-quality datasets is crucial for machine learning model performance, yet data sharing remains limited by privacy concerns and competitive interests, particularly in regulated domains like healthcare. This dynamic especially disadvantages smaller organizations that lack resources to purchase data or negotiate favorable sharing agreements. We present SecureKL, a privacy-preser&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05765v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05765v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05765v1-abstract-full" style="display: none;"> Access to diverse, high-quality datasets is crucial for machine learning model performance, yet data sharing remains limited by privacy concerns and competitive interests, particularly in regulated domains like healthcare. This dynamic especially disadvantages smaller organizations that lack resources to purchase data or negotiate favorable sharing agreements. We present SecureKL, a privacy-preserving framework that enables organizations to identify beneficial data partnerships without exposing sensitive information. Building on recent advances in dataset combination methods, we develop a secure multiparty computation protocol that maintains strong privacy guarantees while achieving &gt;90\% correlation with plaintext evaluations. In experiments with real-world hospital data, SecureKL successfully identifies beneficial data partnerships that improve model performance for intensive care unit mortality prediction while preserving data privacy. Our framework provides a practical solution for organizations seeking to leverage collective data resources while maintaining privacy and competitive advantages. These results demonstrate the potential for privacy-preserving data collaboration to advance machine learning applications in high-stakes domains while promoting more equitable access to data resources. 
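Stripped of the cryptography, the partnership-scoring idea can be sketched in plaintext: summarize each party's features and score a candidate partner by a divergence between the summaries. The diagonal-Gaussian approximation and the function below are illustrative assumptions, not SecureKL's protocol, whose point is computing such scores under secure multiparty computation.

```python
# Plaintext sketch of divergence-based partner scoring (illustrative only).
import numpy as np

def kl_gaussian_score(X_mine, X_partner):
    """KL divergence between diagonal-Gaussian summaries of two feature
    matrices; lower scores suggest more compatible data."""
    mu1, var1 = X_mine.mean(0), X_mine.var(0) + 1e-6
    mu2, var2 = X_partner.mean(0), X_partner.var(0) + 1e-6
    return 0.5 * float(np.sum(np.log(var2 / var1)
                              + (var1 + (mu1 - mu2) ** 2) / var2 - 1.0))
```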
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05765v1-abstract-full').style.display = 'none'; document.getElementById('2502.05765v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05713">arXiv:2502.05713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05713">pdf</a>, <a href="https://arxiv.org/format/2502.05713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> 4D VQ-GAN: Synthesising Medical Scans at Any Time Point for Personalised Disease Progression Modelling of Idiopathic Pulmonary Fibrosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+A">An Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Moucheng Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Shahin%2C+A+H">Ahmed H. Shahin</a>, <a href="/search/cs?searchtype=author&amp;query=Wuyts%2C+W">Wim Wuyts</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+M+G">Mark G. Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Jacob%2C+J">Joseph Jacob</a>, <a href="/search/cs?searchtype=author&amp;query=Alexander%2C+D+C">Daniel C. Alexander</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05713v1-abstract-short" style="display: inline;"> Understanding the progression trajectories of diseases is crucial for early diagnosis and effective treatment planning. This is especially vital for life-threatening conditions such as Idiopathic Pulmonary Fibrosis (IPF), a chronic, progressive lung disease with a prognosis comparable to many cancers. Computed tomography (CT) imaging has been established as a reliable diagnostic tool for IPF. Accu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05713v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05713v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05713v1-abstract-full" style="display: none;"> Understanding the progression trajectories of diseases is crucial for early diagnosis and effective treatment planning. This is especially vital for life-threatening conditions such as Idiopathic Pulmonary Fibrosis (IPF), a chronic, progressive lung disease with a prognosis comparable to many cancers. 
9. arXiv:2502.04747 [pdf, other]
Every Software as an Agent: Blueprint and Case Study
Authors: Mengwei Xu
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
Abstract: The rise of (multimodal) large language models (LLMs) has shed light on software agents, where software can understand and follow user instructions in natural language. However, existing approaches such as API-based and GUI-based agents are far from satisfactory in terms of accuracy and efficiency. Instead, we advocate endowing LLMs with access to the software internals (source code and runtime context) and the permission to dynamically inject generated code into the software for execution. In such a whitebox setting, one may better leverage the software context and the coding ability of LLMs. We then present an overall design architecture and case studies on two popular web-based desktop applications. We also give an in-depth discussion of the challenges and future directions. We deem that such a new paradigm has the potential to fundamentally overturn the existing software agent design, and finally create a digital world in which software can comprehend, operate, collaborate, and even think to meet complex user needs.
Submitted 7 February, 2025; originally announced February 2025.
10. arXiv:2502.01108 [pdf, other]
Pulse-PPG: An Open-Source Field-Trained PPG Foundation Model for Wearable Applications Across Lab and Field Settings
Authors: Mithun Saha, Maxwell A. Xu, Wanting Mao, Sameer Neupane, James M. Rehg, Santosh Kumar
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); eess.SP (Signal Processing)
Abstract: Photoplethysmography (PPG)-based foundation models are gaining traction due to the widespread use of PPG in biosignal monitoring and their potential to generalize across diverse health applications. In this paper, we introduce Pulse-PPG, the first open-source PPG foundation model trained exclusively on raw PPG data collected over a 100-day field study with 120 participants. Existing PPG foundation models are either open-source but trained on clinical data or closed-source, limiting their applicability in real-world settings. We evaluate Pulse-PPG across multiple datasets and downstream tasks, comparing its performance against a state-of-the-art foundation model trained on clinical data. Our results demonstrate that Pulse-PPG, trained on uncurated field data, exhibits superior generalization across clinical and mobile health applications in both lab and field settings. This suggests that exposure to real-world variability enables the model to learn fine-grained representations, making it more adaptable across tasks. Furthermore, pre-training on field data surprisingly outperforms pre-training on clinical data in many tasks, reinforcing the importance of training on real-world, diverse datasets. To encourage further advancements in robust foundation models leveraging field data, we plan to release Pulse-PPG, providing researchers with a powerful resource for developing more generalizable PPG-based models.
Submitted 3 February, 2025; originally announced February 2025.
Comments: The first two listed authors contributed equally to this research
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two listed authors contributed equally to this research</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00858">arXiv:2502.00858</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00858">pdf</a>, <a href="https://arxiv.org/format/2502.00858">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Learning to Plan with Personalized Preferences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Manjie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinyi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+W">Wei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yixin Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00858v1-abstract-short" style="display: inline;"> Effective integration of AI agents into daily life requires them to understand and adapt to individual human preferences, particularly in collaborative roles. Although recent studies on embodied intelligence have advanced significantly, they typically adopt generalized approaches that overlook personal preferences in planning. We address this limitation by developing agents that not only learn pre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00858v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00858v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00858v1-abstract-full" style="display: none;"> Effective integration of AI agents into daily life requires them to understand and adapt to individual human preferences, particularly in collaborative roles. Although recent studies on embodied intelligence have advanced significantly, they typically adopt generalized approaches that overlook personal preferences in planning. We address this limitation by developing agents that not only learn preferences from few demonstrations but also learn to adapt their planning strategies based on these preferences. Our research leverages the observation that preferences, though implicitly expressed through minimal demonstrations, can generalize across diverse planning scenarios. To systematically evaluate this hypothesis, we introduce Preference-based Planning (PbP) benchmark, an embodied benchmark featuring hundreds of diverse preferences spanning from atomic actions to complex sequences. Our evaluation of SOTA methods reveals that while symbol-based approaches show promise in scalability, significant challenges remain in learning to generate and execute plans that satisfy personalized preferences. 
arXiv:2501.19239 [pdf, ps, other] cs.LG stat.ML
Multi-agent Multi-armed Bandit with Fully Heavy-tailed Dynamics
Authors: Xingyu Wang, Mengfan Xu
Abstract: We study decentralized multi-agent multi-armed bandits in fully heavy-tailed settings, where clients communicate over sparse random graphs with heavy-tailed degree distributions and observe heavy-tailed (homogeneous or heterogeneous) reward distributions with potentially infinite variance. The objective is to maximize system performance by pulling the globally optimal arm with the highest global reward mean across all clients. We are the first to address such fully heavy-tailed scenarios, which capture the dynamics and challenges in communication and inference among multiple clients in real-world systems. In homogeneous settings, our algorithmic framework exploits hub-like structures unique to heavy-tailed graphs, allowing clients to aggregate rewards and reduce noise via hub estimators when constructing UCB indices; under $M$ clients and degree distributions with power-law index $\alpha > 1$, our algorithm attains a regret bound of (almost) order $O(M^{1-\frac{1}{\alpha}} \log T)$. Under heterogeneous rewards, clients synchronize by communicating with neighbors and aggregating exchanged estimators in UCB indices; with our newly established information-delay bounds on sparse random graphs, we prove a regret bound of $O(M \log T)$. Our results improve upon existing work, which addresses only time-invariant connected graphs, or light-tailed dynamics in dense graphs and rewards.
Submitted 31 January, 2025; originally announced January 2025.
Comments: 40 pages
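To make the UCB construction concrete: with infinite-variance rewards the empirical mean is unreliable, so robust bandit algorithms replace it with a robust estimator inside the index. Below is a minimal single-agent sketch using a median-of-means estimator, a standard device for heavy-tailed rewards; it is not the paper's hub-based multi-agent estimator, and the block count and exploration constant c are illustrative assumptions.

```python
import numpy as np

def median_of_means(samples, n_blocks=8):
    """Robust mean for heavy-tailed samples: split into blocks,
    average each block, return the median of the block means."""
    samples = np.asarray(samples)
    n_blocks = min(n_blocks, len(samples))
    blocks = np.array_split(samples, n_blocks)
    return np.median([b.mean() for b in blocks])

def robust_ucb(arm_pulls, t, c=2.0):
    """UCB index: robust mean estimate plus an exploration bonus
    that shrinks as the arm accumulates pulls."""
    n = len(arm_pulls)
    return median_of_means(arm_pulls) + c * np.sqrt(np.log(t) / n)

# Toy run: two arms with Pareto rewards (tail index 1.5, so the
# variance is infinite) shifted by different means.
rng = np.random.default_rng(0)
means = [1.0, 1.5]
pulls = [[rng.pareto(1.5) + m] for m in means]  # one warm-up pull per arm
for t in range(2, 2000):
    arm = max(range(2), key=lambda a: robust_ucb(pulls[a], t))
    pulls[arm].append(rng.pareto(1.5) + means[arm])
print("pull counts:", [len(p) for p in pulls])  # the better arm dominates
```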
arXiv:2501.17667 [pdf, other] cs.LG cs.CR
CAMP in the Odyssey: Provably Robust Reinforcement Learning with Certified Radius Maximization
Authors: Derui Wang, Kristen Moore, Diksha Goel, Minjune Kim, Gang Li, Yang Li, Robin Doss, Minhui Xue, Bo Li, Seyit Camtepe, Liming Zhu
Abstract: Deep reinforcement learning (DRL) has gained widespread adoption in control and decision-making tasks due to its strong performance in dynamic environments. However, DRL agents are vulnerable to noisy observations and adversarial attacks, and concerns about the adversarial robustness of DRL systems have emerged. Recent efforts have focused on addressing these robustness issues by establishing rigorous theoretical guarantees for the returns achieved by DRL agents in adversarial settings. Among these approaches, policy smoothing has proven to be an effective and scalable method for certifying the robustness of DRL agents. Nevertheless, existing certifiably robust DRL relies on policies trained with simple Gaussian augmentations, resulting in a suboptimal trade-off between certified robustness and certified return. To address this issue, we introduce a novel paradigm dubbed Certified-rAdius-Maximizing Policy (CAMP) training. CAMP is designed to enhance DRL policies, achieving better utility without compromising provable robustness. By leveraging the insight that the global certified radius can be derived from local certified radii based on training-time statistics, CAMP formulates a surrogate loss related to the local certified radius and optimizes the policy guided by this surrogate loss. We also introduce policy imitation as a novel technique to stabilize CAMP training. Experimental results demonstrate that CAMP significantly improves the robustness-return trade-off across various tasks. Based on the results, CAMP can achieve up to twice the certified expected return compared to that of baselines. Our code is available at https://github.com/NeuralSec/camp-robust-rl.
Submitted 29 January, 2025; originally announced January 2025.
Comments: Accepted to USENIX Security Symposium 2025, Seattle, WA, USA. Source code is available at Github (https://github.com/NeuralSec/camp-robust-rl) and Zenodo (https://zenodo.org/records/14729675)
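For intuition about the certified radius being maximized: Gaussian smoothing certificates in the style this line of work builds on (Cohen et al., 2019) guarantee stability of the smoothed decision for any perturbation of l2 norm below R = (sigma / 2) * (Phi^-1(p_A) - Phi^-1(p_B)). A minimal sketch of that generic certificate, not CAMP's training objective, with illustrative numbers:

```python
from scipy.stats import norm

def certified_radius(sigma, p_a, p_b):
    """Gaussian randomized-smoothing radius: the smoothed decision is
    provably unchanged for any perturbation with l2 norm below this
    value. p_a / p_b bound the probabilities of the top choice and the
    runner-up under the smoothing noise."""
    if p_a <= p_b:
        return 0.0  # no certificate if the margin is not established
    return 0.5 * sigma * (norm.ppf(p_a) - norm.ppf(p_b))

# Example: noise sigma = 0.5, top action observed 96% of the time
# under noise, runner-up 3%.
print(certified_radius(0.5, 0.96, 0.03))  # ~0.91
```

CAMP's contribution, per the abstract, is a surrogate training loss that enlarges such radii; the formula above is only the certificate it is measured against.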
arXiv:2501.16843 [pdf, other] cs.CR
Bones of Contention: Exploring Query-Efficient Attacks Against Skeleton Recognition Systems
Authors: Yuxin Cao, Kai Ye, Derui Wang, Minhui Xue, Hao Ge, Chenxiong Qian, Jin Song Dong
Abstract: Skeleton action recognition models have secured more attention than video-based ones in various applications due to privacy preservation and lower storage requirements. Skeleton data are typically transmitted to cloud servers for action recognition, with results returned to clients via Apps/APIs. However, the vulnerability of skeletal models against adversarial perturbations gradually reveals the unreliability of these systems. Existing black-box attacks all operate in a decision-based manner, resulting in numerous queries that hinder efficiency and feasibility in real-world applications. Moreover, all off-the-shelf attacks focus only on restricted perturbations, ignoring model weaknesses under non-semantic perturbations. In this paper, we propose two query-effIcient Skeletal Adversarial AttaCks, ISAAC-K and ISAAC-N. As a black-box attack, ISAAC-K utilizes Grad-CAM in a surrogate model to extract key joints, to which minor sparse perturbations are then added to fool the classifier. To guarantee natural adversarial motions, we introduce constraints on both bone length and temporal consistency. ISAAC-K finds stronger adversarial examples under the $\ell_\infty$ norm, which can encompass those on other norms. Exhaustive experiments substantiate that ISAAC-K improves the attack efficiency of perturbations across 10 skeletal models. Additionally, as a byproduct, ISAAC-N fools the classifier by replacing skeletons unrelated to the action. We surprisingly find that skeletal models are vulnerable to large perturbations in which part-wise non-semantic joints are simply replaced, leading to a query-free no-box attack without any prior knowledge. Based on that, four adaptive defenses are eventually proposed to improve the robustness of skeleton recognition models.
Submitted 28 January, 2025; originally announced January 2025.
Comments: 13 pages, 13 figures
arXiv:2501.15588 [pdf, other] eess.IV cs.CV
Tumor Detection, Segmentation and Classification Challenge on Automated 3D Breast Ultrasound: The TDSC-ABUS Challenge
Authors: Gongning Luo, Mingwang Xu, Hongyu Chen, Xinjie Liang, Xing Tao, Dong Ni, Hyunsu Jeong, Chulhong Kim, Raphael Stock, Michael Baumgartner, Yannick Kirchhoff, Maximilian Rokuss, Klaus Maier-Hein, Zhikai Yang, Tianyu Fan, Nicolas Boutry, Dmitry Tereshchenko, Arthur Moine, Maximilien Charmetant, Jan Sauer, Hao Du, Xiang-Hui Bai, Vipul Pai Raikar, Ricardo Montoya-del-Angel, Robert Marti, et al. (12 additional authors not shown)
Abstract: Breast cancer is one of the most common causes of death among women worldwide. Early detection helps in reducing the number of deaths. Automated 3D Breast Ultrasound (ABUS) is a newer approach for breast screening, which has many advantages over handheld mammography such as safety, speed, and a higher detection rate of breast cancer. Tumor detection, segmentation, and classification are key components in the analysis of medical images, and they are especially challenging in the context of 3D ABUS due to the significant variability in tumor size and shape, unclear tumor boundaries, and a low signal-to-noise ratio. The lack of publicly accessible, well-labeled ABUS datasets further hinders the advancement of systems for breast tumor analysis. Addressing this gap, we have organized the inaugural Tumor Detection, Segmentation, and Classification Challenge on Automated 3D Breast Ultrasound 2023 (TDSC-ABUS2023). This initiative aims to spearhead research in this field and create a definitive benchmark for tasks associated with 3D ABUS image analysis. In this paper, we summarize the top-performing algorithms from the challenge and provide critical analysis for ABUS image examination. We offer the TDSC-ABUS challenge as an open-access platform at https://tdsc-abus2023.grand-challenge.org/ to benchmark and inspire future developments in algorithmic research.
Submitted 26 January, 2025; originally announced January 2025.
arXiv:2501.15062 [pdf, other] cs.LG
Exact Fit Attention in Node-Holistic Graph Convolutional Network for Improved EEG-Based Driver Fatigue Detection
Authors: Meiyan Xu, Qingqing Chen, Duo Chen, Yi Ding, Jingyuan Wang, Peipei Gu, Yijie Pan, Deshuang Huang, Xun Zhang, Jiayang Guo
Abstract: EEG-based fatigue monitoring can effectively reduce the incidence of related traffic accidents. In the past decade, with the advancement of deep learning, convolutional neural networks (CNNs) have been increasingly used for EEG signal processing. However, due to the data's non-Euclidean characteristics, existing CNNs may lose important spatial information from EEG, specifically channel correlation. Thus, we propose the node-holistic graph convolutional network (NHGNet), a model that uses graph convolution to dynamically learn each channel's features. With exact-fit attention optimization, the network captures inter-channel correlations through a trainable adjacency matrix. Interpretability is enhanced by revealing critical areas of brain activity and their interrelations in various mental states. In validations on two public datasets, NHGNet outperforms the SOTAs. Specifically, in the intra-subject setting, NHGNet improved detection accuracy by at least 2.34% and 3.42%, and in the inter-subject setting, by at least 2.09% and 15.06%. Visualization research on the model revealed that the central parietal area plays an important role in detecting fatigue levels, whereas the frontal and temporal lobes are essential for maintaining vigilance.
Submitted 24 January, 2025; originally announced January 2025.
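A minimal sketch of the core idea of a trainable adjacency matrix in a graph-convolution layer over EEG channels: the adjacency is a free parameter, row-normalized on each forward pass, so inter-channel correlations are learned rather than fixed by electrode geometry. The class name, layer sizes, and softmax normalization here are illustrative assumptions, not NHGNet's exact design.

```python
import torch
import torch.nn as nn

class LearnableAdjGCN(nn.Module):
    """One graph-convolution layer whose adjacency over the EEG
    channels is itself a trainable parameter."""
    def __init__(self, n_channels, in_feats, out_feats):
        super().__init__()
        self.adj = nn.Parameter(torch.randn(n_channels, n_channels))
        self.lin = nn.Linear(in_feats, out_feats)

    def forward(self, x):                     # x: (batch, channels, feats)
        a = torch.softmax(self.adj, dim=-1)   # row-normalized adjacency
        return torch.relu(self.lin(a @ x))    # mix channels, then project

layer = LearnableAdjGCN(n_channels=32, in_feats=128, out_feats=64)
out = layer(torch.randn(8, 32, 128))          # -> (8, 32, 64)
```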
arXiv:2501.14302 [pdf, other] cs.CV
TD-RD: A Top-Down Benchmark with Real-Time Framework for Road Damage Detection
Authors: Xi Xiao, Zhengji Li, Wentao Wang, Jiacheng Xie, Houjie Lin, Swalpa Kumar Roy, Tianyang Wang, Min Xu
Abstract: Object detection has witnessed remarkable advancements over the past decade, largely driven by breakthroughs in deep learning and the proliferation of large-scale datasets. However, the domain of road damage detection remains relatively underexplored, despite its critical significance for applications such as infrastructure maintenance and road safety. This paper addresses this gap by introducing a novel top-down benchmark that offers a complementary perspective to existing datasets, specifically tailored for road damage detection. Our proposed Top-Down Road Damage Detection Dataset (TDRD) includes three primary categories of road damage: cracks, potholes, and patches, captured from a top-down viewpoint. The dataset consists of 7,088 high-resolution images, encompassing 12,882 annotated instances of road damage. Additionally, we present a novel real-time object detection framework, TDYOLOV10, designed to handle the unique challenges posed by the TDRD dataset. Comparative studies with state-of-the-art models demonstrate competitive baseline results. By releasing TDRD, we aim to accelerate research in this crucial area. A sample of the dataset will be made publicly available upon the paper's acceptance.
Submitted 24 January, 2025; originally announced January 2025.
arXiv:2501.14205 [pdf, other] cs.NI
Serving Long-Context LLMs at the Mobile Edge: Test-Time Reinforcement Learning-based Model Caching and Inference Offloading
Authors: Minrui Xu, Dusit Niyato, Christopher G. Brinton
Abstract: Large Language Models (LLMs) can perform zero-shot learning on unseen tasks and few-shot learning on complex reasoning tasks. However, resource-limited mobile edge networks struggle to support long-context LLM serving for LLM agents during multi-round interactions with users. Unlike stateless computation offloading and static service offloading in edge computing, optimizing LLM serving at edge servers is challenging because LLMs continuously learn from context, which makes accuracy, latency, and resource consumption dynamic. In this paper, we propose a joint model caching and inference offloading framework that utilizes test-time deep reinforcement learning (T2DRL) to optimize deployment and execution strategies for long-context LLM serving. In this framework, we analyze the performance convergence and design an optimization problem considering the utilization of context windows in LLMs. Furthermore, the T2DRL algorithm can learn in both the training phase and the testing phase to proactively manage cached models and service requests and to adapt to context changes and usage patterns during execution. To further enhance resource allocation efficiency, we propose a double Dutch auction (DDA) mechanism, which dynamically matches supply and demand while maximizing social welfare. Finally, experimental results demonstrate that the T2DRL algorithm can reduce system costs by at least 30% compared to baselines while guaranteeing the performance of LLM agents in real-world perception and reasoning tasks.
Submitted 23 January, 2025; originally announced January 2025.
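For flavor on the auction component: the sketch below is a plain static double auction that clears welfare-maximizing trades by pairing the highest remaining bid with the lowest remaining ask, which is the simplest cousin of the double Dutch auction named above (a DDA instead runs two converging price clocks). The function, values, and midpoint pricing rule are all illustrative assumptions.

```python
def clear_double_auction(bids, asks):
    """Match buyers to sellers to maximize total surplus: pair the
    highest remaining bid with the lowest remaining ask while the
    bid still covers the ask."""
    bids, asks = sorted(bids, reverse=True), sorted(asks)
    trades = []
    for b, a in zip(bids, asks):
        if b < a:
            break  # no further welfare-improving trade exists
        trades.append((b, a, (b + a) / 2))  # midpoint trade price
    return trades

# Buyers' valuations vs. sellers' costs for an edge-serving slot.
print(clear_double_auction([9, 7, 4, 2], [3, 5, 6, 8]))
# -> [(9, 3, 6.0), (7, 5, 6.0)]; the 4-vs-6 pair stays unmatched.
```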
arXiv:2501.13351 [pdf, other] cs.CR, doi: 10.1145/3696410.3714593 (https://doi.org/10.1145/3696410.3714593)
50 Shades of Deceptive Patterns: A Unified Taxonomy, Multimodal Detection, and Security Implications
Authors: Zewei Shi, Ruoxi Sun, Jieshan Chen, Jiamou Sun, Minhui Xue, Yansong Gao, Feng Liu, Xingliang Yuan
Abstract: Deceptive patterns (DPs) are user interface designs deliberately crafted to manipulate users into unintended decisions, often by exploiting cognitive biases for the benefit of companies or services. While numerous studies have explored ways to identify these deceptive patterns, many existing solutions require significant human intervention and struggle to keep pace with the evolving nature of deceptive designs. To address these challenges, we expanded the deceptive pattern taxonomy from security and privacy perspectives, refining its categories and scope. We created a comprehensive dataset of deceptive patterns by integrating existing small-scale datasets with new samples, resulting in 6,725 images and 10,421 DP instances from mobile apps and websites. We then developed DPGuard, a novel automatic tool leveraging commercial multimodal large language models (MLLMs) for deceptive pattern detection. Experimental results show that DPGuard outperforms state-of-the-art methods. Finally, we conducted an extensive empirical evaluation on 2,000 popular mobile apps and websites, revealing that 23.61% of mobile screenshots and 47.27% of website screenshots feature at least one deceptive pattern instance. Through four unexplored case studies that inform security implications, we highlight the critical importance of the unified taxonomy in addressing the growing challenges of Internet deception.
Submitted 3 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
Comments: This paper has been accepted by The Web Conference 2025

arXiv:2501.13054 [pdf, other] cs.CV
STMDNet: A Lightweight Directional Framework for Motion Pattern Recognition of Tiny Targets
Authors: Mingshuo Xu, Hao Luan, Zhou Daniel Hao, Jigen Peng, Shigang Yue
Abstract: Recognizing motions of tiny targets - only a few dozen pixels - in cluttered backgrounds remains a fundamental challenge when standard feature-based or deep learning methods fail under scarce visual cues. We propose STMDNet, a model-based computational framework to recognize motions of tiny targets at variable velocities under low-sampling-frequency scenarios. STMDNet designs a novel dual-dynamics-and-correlation mechanism, harnessing ipsilateral excitation to integrate target cues and leakage-enhancing-type contralateral inhibition to suppress large-object and background motion interference. Moreover, we develop the first collaborative directional encoding-decoding strategy that determines the motion direction from only one correlation per spatial location, cutting computational costs to one-eighth of prior methods. Further, simply substituting the backbone of a strong STMD model with STMDNet raises AUC by 24%, yielding an enhanced STMDNet-F. Evaluations on real-world low-sampling-frequency datasets show state-of-the-art results, surpassing the deep learning baseline. Across diverse speeds, STMDNet-F improves mF1 by 19%, 16%, and 8% at 240 Hz, 120 Hz, and 60 Hz, respectively, while STMDNet achieves 87 FPS on a single CPU thread. These advances highlight STMDNet as a next-generation backbone for tiny target motion pattern recognition and underscore its broader potential to revitalize model-based visual approaches in motion detection.
Submitted 22 January, 2025; originally announced January 2025.
Comments: 10 pages, 8 figures
arXiv:2501.12860 [pdf, other] cs.CV
CrossDiff: Diffusion Probabilistic Model With Cross-conditional Encoder-Decoder for Crack Segmentation
Authors: Xianglong Shi, Yunhan Jiang, Xiaoheng Jiang, Mingling Xu, Yang Liu
Abstract: Crack segmentation on industrial concrete surfaces is a challenging task because cracks usually exhibit intricate morphology with slender appearances. Traditional segmentation methods often struggle to accurately locate such cracks, leading to inefficiencies in maintenance and repair processes. In this paper, we propose a novel diffusion-based model with a cross-conditional encoder-decoder, named CrossDiff, which is the first to introduce the diffusion probabilistic model to the crack segmentation task. Specifically, CrossDiff integrates a cross-encoder and a cross-decoder into the diffusion model to constitute a cross-shaped diffusion model structure. The cross-encoder enhances the ability to retain crack details and the cross-decoder helps extract the semantic features of cracks. As a result, CrossDiff can better handle slender cracks. Extensive experiments were conducted on five challenging crack datasets: CFD, CrackTree200, DeepCrack, GAPs384, and Rissbilder. The results demonstrate that the proposed CrossDiff model achieves impressive performance, outperforming other state-of-the-art methods by 8.0% in terms of both Dice score and IoU. The code will be open-sourced soon.
Submitted 22 January, 2025; originally announced January 2025.
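As background for the diffusion probabilistic model this entry builds on: the standard DDPM forward process corrupts a clean mask x0 in closed form as x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise, and the network is trained to invert it. A minimal generic sketch with the common DDPM schedule follows; it illustrates only the forward process, not CrossDiff's cross-shaped encoder-decoder.

```python
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)          # standard DDPM noise schedule
alpha_bar = torch.cumprod(1.0 - betas, dim=0)  # cumulative signal retention

def forward_diffuse(x0, t):
    """Sample x_t ~ q(x_t | x_0) in closed form:
    x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise."""
    noise = torch.randn_like(x0)
    a = alpha_bar[t].sqrt()
    s = (1.0 - alpha_bar[t]).sqrt()
    return a * x0 + s * noise, noise  # the model learns to predict `noise`

mask = torch.rand(1, 1, 64, 64)       # a toy segmentation mask
x_t, eps = forward_diffuse(mask, t=500)
```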
arXiv:2501.12622 [pdf, other] cs.CR cs.AI
Towards Robust Multi-tab Website Fingerprinting
Authors: Xinhao Deng, Xiyuan Zhao, Qilei Yin, Zhuotao Liu, Qi Li, Mingwei Xu, Ke Xu, Jianping Wu
Abstract: Website fingerprinting enables an eavesdropper to determine which websites a user is visiting over an encrypted connection. State-of-the-art website fingerprinting (WF) attacks have demonstrated effectiveness even against Tor-protected network traffic. However, existing WF attacks have critical limitations in accurately identifying websites in multi-tab browsing sessions, where the holistic pattern of individual websites is no longer preserved and the number of tabs opened by a client is unknown a priori. In this paper, we propose ARES, a novel WF framework natively designed for multi-tab WF attacks. ARES formulates the multi-tab attack as a multi-label classification problem and solves it using novel Transformer-based models. Specifically, ARES extracts local patterns based on multi-level traffic aggregation features and utilizes an improved self-attention mechanism to analyze the correlations between these local patterns, effectively identifying websites. We implement a prototype of ARES and extensively evaluate its effectiveness using our large-scale datasets collected over multiple months. The experimental results illustrate that ARES achieves optimal performance in several realistic scenarios. Further, ARES remains robust even against various WF defenses.
Submitted 21 January, 2025; originally announced January 2025.
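The multi-label formulation above replaces the usual softmax over monitored sites with one independent sigmoid per site, so any subset of websites can be predicted from a single traffic trace, matching the unknown-tab-count setting. A minimal generic sketch of that classification head (the sizes and the two-layer head are illustrative assumptions, not ARES's Transformer):

```python
import torch
import torch.nn as nn

N_SITES = 100   # monitored websites (illustrative)
FEATS = 256     # per-trace feature dimension (illustrative)

# A tiny stand-in classifier head: one logit per website.
head = nn.Sequential(nn.Linear(FEATS, 128), nn.ReLU(), nn.Linear(128, N_SITES))
loss_fn = nn.BCEWithLogitsLoss()  # independent sigmoid per label

features = torch.randn(32, FEATS)           # a batch of traffic embeddings
labels = torch.zeros(32, N_SITES)
labels[:, [3, 17]] = 1.0                    # e.g. two tabs open per session

logits = head(features)
loss = loss_fn(logits, labels)
predicted = (torch.sigmoid(logits) > 0.5)   # any number of sites per trace
```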
arXiv:2501.11782 [pdf, other] cs.HC cs.AI
Human-AI Collaborative Game Testing with Vision Language Models
Authors: Boran Zhang, Muhan Xu, Zhijun Pan
Abstract: As modern video games become increasingly complex, traditional manual testing methods are proving costly and inefficient, limiting the ability to ensure high-quality game experiences. While advancements in Artificial Intelligence (AI) offer the potential to assist human testers, the effectiveness of AI in truly enhancing real-world human performance remains underexplored. This study investigates how AI can improve game testing by developing and experimenting with an AI-assisted workflow that leverages state-of-the-art machine learning models for defect detection. Through an experiment involving 800 test cases and 276 participants of varying backgrounds, we evaluate the effectiveness of AI assistance under four conditions: with or without AI support, and with or without detailed knowledge of defects and design documentation. The results indicate that AI assistance significantly improves defect identification performance, particularly when paired with detailed knowledge. However, challenges arise when AI errors occur, negatively impacting human decision-making. Our findings show the importance of optimizing human-AI collaboration and implementing strategies to mitigate the effects of AI inaccuracies. Through this research, we demonstrate AI's potential and problems in enhancing efficiency and accuracy in game testing workflows and offer practical insights for integrating AI into the testing process.
Submitted 20 January, 2025; originally announced January 2025.
Comments: Experiment Report
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11782v1-abstract-full').style.display = 'none'; document.getElementById('2501.11782v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Experiment Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09283">arXiv:2501.09283</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09283">pdf</a>, <a href="https://arxiv.org/format/2501.09283">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Free-Knots Kolmogorov-Arnold Network: On the Analysis of Spline Knots and Advancing Stability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L+N">Liangwewi Nathan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W+E">Wei Emma Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+L">Lin Yue</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Miao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Maennel%2C+O">Olaf Maennel</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weitong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09283v1-abstract-short" style="display: inline;"> Kolmogorov-Arnold Neural Networks (KANs) have gained significant attention in the machine learning community. However, their implementation often suffers from poor training stability and heavy trainable parameter. Furthermore, there is limited understanding of the behavior of the learned activation functions derived from B-splines. In this work, we analyze the behavior of KANs through the lens of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09283v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09283v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09283v1-abstract-full" style="display: none;"> Kolmogorov-Arnold Neural Networks (KANs) have gained significant attention in the machine learning community. However, their implementation often suffers from poor training stability and heavy trainable parameter. Furthermore, there is limited understanding of the behavior of the learned activation functions derived from B-splines. In this work, we analyze the behavior of KANs through the lens of spline knots and derive the lower and upper bound for the number of knots in B-spline-based KANs. To address existing limitations, we propose a novel Free Knots KAN that enhances the performance of the original KAN while reducing the number of trainable parameters to match the trainable parameter scale of standard Multi-Layer Perceptrons (MLPs). 
arXiv:2501.09218 [pdf] q-bio.QM cs.AI
Interpretable Droplet Digital PCR Assay for Trustworthy Molecular Diagnostics
Authors: Yuanyuan Wei, Yucheng Wu, Fuyang Qu, Yao Mu, Yi-Ping Ho, Ho-Pui Ho, Wu Yuan, Mingkun Xu
Abstract: Accurate molecular quantification is essential for advancing research and diagnostics in fields such as infectious diseases, cancer biology, and genetic disorders. Droplet digital PCR (ddPCR) has emerged as a gold standard for achieving absolute quantification. While computational ddPCR technologies have advanced significantly, achieving automatic interpretation and consistent adaptability across diverse operational environments remains a challenge. To address these limitations, we introduce the intelligent interpretable droplet digital PCR (I2ddPCR) assay, a comprehensive framework integrating front-end predictive models (for droplet segmentation and classification) with the GPT-4o multimodal large language model (MLLM, for context-aware explanations and recommendations) to automate and enhance ddPCR image analysis. This approach surpasses state-of-the-art models, affording 99.05% accuracy in processing complex ddPCR images containing over 300 droplets per image with varying signal-to-noise ratios (SNRs). By combining specialized neural networks and large language models, the I2ddPCR assay offers a robust and adaptable solution for absolute molecular quantification, achieving a sensitivity capable of detecting low-abundance targets as low as 90.32 copies/μL. Furthermore, it improves the model's transparency through detailed explanations and troubleshooting guidance, empowering users to make informed decisions. This innovative framework has the potential to benefit molecular diagnostics, disease research, and clinical applications, especially in resource-constrained settings.
Submitted 15 January, 2025; originally announced January 2025.
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09218">arXiv:2501.09218</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09218">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Interpretable Droplet Digital PCR Assay for Trustworthy Molecular Diagnostics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yuanyuan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yucheng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+F">Fuyang Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+Y">Yao Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+Y">Yi-Ping Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+H">Ho-Pui Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Wu Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingkun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Accurate molecular quantification is essential for advancing research and diagnostics in fields such as infectious diseases, cancer biology, and genetic disorders. Droplet digital PCR (ddPCR) has emerged as a gold standard for achieving absolute quantification. While computational ddPCR technologies have advanced significantly, achieving automatic interpretation and consistent adaptability across diverse operational environments remains a challenge. To address these limitations, we introduce the intelligent interpretable droplet digital PCR (I2ddPCR) assay, a comprehensive framework integrating front-end predictive models (for droplet segmentation and classification) with the GPT-4o multimodal large language model (MLLM, for context-aware explanations and recommendations) to automate and enhance ddPCR image analysis. This approach surpasses state-of-the-art models, affording 99.05% accuracy in processing complex ddPCR images containing over 300 droplets per image with varying signal-to-noise ratios (SNRs). By combining specialized neural networks and large language models, the I2ddPCR assay offers a robust and adaptable solution for absolute molecular quantification, achieving a sensitivity capable of detecting low-abundance targets as low as 90.32 copies/&mu;L. Furthermore, it improves the model&#39;s transparency through detailed explanations and troubleshooting guidance, empowering users to make informed decisions. This innovative framework has the potential to benefit molecular diagnostics, disease research, and clinical applications, especially in resource-constrained settings. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
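<p class="is-size-7"> Independent of the learned front end described above, the final quantification step in ddPCR is classical Poisson statistics. The sketch below shows that step alone; the 0.85 nL droplet volume is a typical commercial value used purely for illustration, not a figure from the paper. </p> <pre><code class="language-python">
# Poisson correction for ddPCR: from the positive-droplet fraction to copies/uL.
import math

def ddpcr_concentration(n_positive: int, n_total: int,
                        droplet_volume_ul: float = 0.00085) -> float:
    """lambda = -ln(1 - p) is the mean number of target copies per droplet,
    where p is the fraction of positive droplets (requires p &lt; 1); dividing
    by the droplet volume gives the absolute concentration in copies/uL."""
    p = n_positive / n_total
    lam = -math.log(1.0 - p)
    return lam / droplet_volume_ul

# e.g. 23 positives out of 300 droplets -> roughly 94 copies/uL
print(round(ddpcr_concentration(23, 300), 1))
</code></pre>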
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08062">arXiv:2501.08062</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.08062">pdf</a>, <a href="https://arxiv.org/format/2501.08062">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Skeleton and Font Generation Network for Zero-shot Chinese Character Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+M">Mobai Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+J">Jun Du</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhenrong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jiefeng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+Q">Qikai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+P">Pengfei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianshu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yu Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Automatic font generation remains a challenging research issue, primarily due to the vast number of Chinese characters, each with a unique and intricate structure. Our investigation of previous studies reveals an inherent bias capable of causing structural changes in characters. Specifically, when generating a Chinese character similar to, but different from, those in the training samples, the bias is prone to either correcting or ignoring these subtle variations. To address this concern, we propose a novel Skeleton and Font Generation Network (SFGN) to achieve more robust Chinese character font generation. Our approach includes a skeleton builder and a font generator. The skeleton builder synthesizes content features using low-resource text input, enabling our technique to realize font generation independently of content image inputs. Unlike previous font generation methods that treat font style as a global embedding, we introduce a font generator that aligns content and style features at the radical level, a brand-new perspective for font generation. Beyond common characters, we also conduct experiments on misspelled characters, a substantial portion of which differ only slightly from common ones. Our approach visually demonstrates the efficacy of the generated images and outperforms current state-of-the-art font generation methods. Moreover, we believe that misspelled character generation has significant pedagogical implications, and we verify this supposition through experiments. We used generated misspelled characters as data augmentation in Chinese character error correction tasks, simulating the scenario where students learn handwritten Chinese characters with the help of misspelled characters. The significantly improved performance on error correction tasks demonstrates the effectiveness of our proposed approach and the value of misspelled character generation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06818">arXiv:2501.06818</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06818">pdf</a>, <a href="https://arxiv.org/format/2501.06818">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UR2P-Dehaze: Learning a Simple Image Dehaze Enhancer via Unpaired Rich Physical Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+M">Minglong Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+S">Shuaibin Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Palaiahnakote%2C+S">Shivakumara Palaiahnakote</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+M">Mingliang Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06818v1-abstract-short" style="display: inline;"> Image dehazing techniques aim to enhance contrast and restore details, which are essential for preserving visual information and improving image processing accuracy. Existing methods rely on a single manual prior, which cannot effectively reveal image details. To overcome this limitation, we propose an unpaired image dehazing network, called the Simple Image Dehaze Enhancer via Unpaired Rich Physi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06818v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06818v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06818v1-abstract-full" style="display: none;"> Image dehazing techniques aim to enhance contrast and restore details, which are essential for preserving visual information and improving image processing accuracy. Existing methods rely on a single manual prior, which cannot effectively reveal image details. To overcome this limitation, we propose an unpaired image dehazing network, called the Simple Image Dehaze Enhancer via Unpaired Rich Physical Prior (UR2P-Dehaze). First, to accurately estimate the illumination, reflectance, and color information of the hazy image, we design a shared prior estimator (SPE) that is iteratively trained to ensure the consistency of illumination and reflectance, generating clear, high-quality images. Additionally, a self-monitoring mechanism is introduced to eliminate undesirable features, providing reliable priors for image reconstruction. Next, we propose Dynamic Wavelet Separable Convolution (DWSC), which effectively integrates key features across both low and high frequencies, significantly enhancing the preservation of image details and ensuring global consistency. Finally, to effectively restore the color information of the image, we propose an Adaptive Color Corrector that addresses the problem of unclear colors. The PSNR, SSIM, LPIPS, FID and CIEDE2000 metrics on the benchmark dataset show that our method achieves state-of-the-art performance. 
It also contributes to the performance improvement of downstream tasks. The project code will be available at https://github.com/Fan-pixel/UR2P-Dehaze. \end{abstract} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06818v1-abstract-full').style.display = 'none'; document.getElementById('2501.06818v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06277">arXiv:2501.06277</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06277">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Environmental large language model Evaluation (ELLE) dataset: A Benchmark for Evaluating Generative AI applications in Eco-environment Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jing Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Nan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Ming Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06277v1-abstract-short" style="display: inline;"> Generative AI holds significant potential for ecological and environmental applications such as monitoring, data analysis, education, and policy support. However, its effectiveness is limited by the lack of a unified evaluation framework. To address this, we present the Environmental Large Language model Evaluation (ELLE) question answer (QA) dataset, the first benchmark designed to assess large l&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06277v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06277v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06277v1-abstract-full" style="display: none;"> Generative AI holds significant potential for ecological and environmental applications such as monitoring, data analysis, education, and policy support. However, its effectiveness is limited by the lack of a unified evaluation framework. To address this, we present the Environmental Large Language model Evaluation (ELLE) question answer (QA) dataset, the first benchmark designed to assess large language models and their applications in ecological and environmental sciences. The ELLE dataset includes 1,130 question answer pairs across 16 environmental topics, categorized by domain, difficulty, and type. This comprehensive dataset standardizes performance assessments in these fields, enabling consistent and objective comparisons of generative AI performance. By providing a dedicated evaluation tool, ELLE dataset promotes the development and application of generative AI technologies for sustainable environmental outcomes. 
The dataset and code are available at https://elle.ceeai.net/ and https://github.com/CEEAI/elle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06277v1-abstract-full').style.display = 'none'; document.getElementById('2501.06277v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04860">arXiv:2501.04860</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04860">pdf</a>, <a href="https://arxiv.org/format/2501.04860">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Use of Robots for Diary Studies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M+F">Michael F. Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Mutlu%2C+B">Bilge Mutlu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04860v2-abstract-short" style="display: inline;"> As interest in studying in-the-wild human-robot interaction grows, there is a need for methods to collect data over time and in naturalistic or potentially private environments. HRI researchers have increasingly used the diary method for these studies, asking study participants to self-administer a structured data collection instrument, i.e., a diary, over a period of time. Although the diary meth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04860v2-abstract-full').style.display = 'inline'; document.getElementById('2501.04860v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04860v2-abstract-full" style="display: none;"> As interest in studying in-the-wild human-robot interaction grows, there is a need for methods to collect data over time and in naturalistic or potentially private environments. HRI researchers have increasingly used the diary method for these studies, asking study participants to self-administer a structured data collection instrument, i.e., a diary, over a period of time. Although the diary method offers a unique window into settings that researchers may not have access to, they also lack the interactivity and probing that interview-based methods offer. In this paper, we explore a novel data collection method in which a robot plays the role of an interactive diary. We developed the Diary Robot system and performed in-home deployments for a week to evaluate the feasibility and effectiveness of this approach. Using traditional text-based and audio-based diaries as benchmarks, we found that robots are able to effectively elicit the intended information. We reflect on our findings, and describe scenarios where the utilization of robots in diary studies as a data collection instrument may be especially applicable. 
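<p class="is-size-7"> As a usage illustration for a QA benchmark of this shape, the sketch below scores a model over question-answer pairs and reports accuracy per topic and difficulty. The field names and exact-match scoring are assumptions, not the dataset&#39;s documented schema. </p> <pre><code class="language-python">
# Hypothetical evaluation loop for an ELLE-style QA benchmark.
import json
from collections import defaultdict

def evaluate(qa_path: str, ask_model) -> dict:
    """ask_model: any callable mapping a question string to an answer string."""
    buckets = defaultdict(lambda: [0, 0])                  # group -> [correct, total]
    with open(qa_path, encoding="utf-8") as f:
        items = json.load(f)                               # assumed: a list of dicts
    for item in items:
        pred = ask_model(item["question"]).strip().lower()
        ok = int(pred == item["answer"].strip().lower())   # exact match (assumed)
        for key in (item["topic"], item["difficulty"]):
            buckets[key][0] += ok
            buckets[key][1] += 1
    return {k: c / t for k, (c, t) in buckets.items()}

# evaluate("elle.json", my_model)  # -> e.g. {"water": 0.71, "easy": 0.83, ...}
</code></pre>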
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04860v2-abstract-full').style.display = 'none'; document.getElementById('2501.04860v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Proceedings of the 20th ACM/IEEE International Conference on Human Robot Interaction (HRI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04102">arXiv:2501.04102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04102">pdf</a>, <a href="https://arxiv.org/format/2501.04102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Distribution and Label Consistency for Graph Out-of-Distribution Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaodong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Islam%2C+R">Rashidul Islam</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Huiyuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minghua Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jundong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yiwei Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04102v1-abstract-short" style="display: inline;"> To deal with distribution shifts in graph data, various graph out-of-distribution (OOD) generalization techniques have been recently proposed. These methods often employ a two-step strategy that first creates augmented environments and subsequently identifies invariant subgraphs to improve generalizability. Nevertheless, this approach could be suboptimal from the perspective of consistency. First,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04102v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04102v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04102v1-abstract-full" style="display: none;"> To deal with distribution shifts in graph data, various graph out-of-distribution (OOD) generalization techniques have been recently proposed. These methods often employ a two-step strategy that first creates augmented environments and subsequently identifies invariant subgraphs to improve generalizability. Nevertheless, this approach could be suboptimal from the perspective of consistency. 
First, the process of augmenting environments by altering the graphs while preserving labels may lead to graphs that are not realistic or meaningfully related to the original distribution, thus lacking distribution consistency. Second, the extracted subgraphs are obtained by directly modifying graphs and may not necessarily maintain a consistent predictive relationship with their labels, thereby impacting label consistency. In response to these challenges, we introduce an innovative approach that aims to enhance these two types of consistency for graph OOD generalization. We propose a modifier to obtain both augmented and invariant graphs in a unified manner. With the augmented graphs, we enrich the training data without compromising the integrity of label-graph relationships. The label consistency enhancement in our framework further preserves the supervision information in the invariant graph. We conduct extensive experiments on real-world datasets to demonstrate the superiority of our framework over other state-of-the-art baselines. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICDM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02863">arXiv:2501.02863</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02863">pdf</a>, <a href="https://arxiv.org/format/2501.02863">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Beyond Pass or Fail: Multi-Dimensional Benchmarking of Foundation Models for Goal-based Mobile UI Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ran%2C+D">Dezhi Ran</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mengzhou Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Hao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuetong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+J">Jun Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yuan Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+X">Xia Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+H">Haochuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zexin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengqian Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+T">Ting Su</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+L">Liangchao Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+T">Ting Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+W">Wei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Y">Yuetang Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Marron%2C+A">Assaf Marron</a>, <a href="/search/cs?searchtype=author&amp;query=Harel%2C+D">David Harel</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Tao Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent advances in foundation models (FMs) have made navigating mobile applications (apps) based on high-level goal instructions within reach, with significant industrial applications such as UI testing. While existing benchmarks evaluate FM-based UI navigation using a binary pass/fail metric, they have two major limitations: they cannot reflect the complex nature of mobile UI navigation, where FMs may fail for various reasons (e.g., misunderstanding instructions and failed planning), and they lack industrial relevance due to oversimplified tasks that poorly represent real-world scenarios. To address the preceding limitations, we propose Sphinx, a comprehensive benchmark for multi-dimensional evaluation of FMs in industrial settings of UI navigation. Sphinx introduces a specialized toolkit that evaluates five essential FM capabilities, providing detailed insights into failure modes such as insufficient app knowledge or planning issues. Using both popular Google Play applications and WeChat&#39;s internal UI test cases, we evaluate 8 FMs with 20 different configurations. Our results show that existing FMs universally struggle with goal-based testing tasks, primarily due to insufficient UI-specific capabilities. We summarize seven lessons learned from benchmarking FMs with Sphinx, providing clear directions for improving FM-based mobile UI navigation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01030">arXiv:2501.01030</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01030">pdf</a>, <a href="https://arxiv.org/format/2501.01030">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reasoning based on symbolic and parametric knowledge bases: a survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mayi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+Y">Yunfeng Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jianhao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Jintao Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Y">Yao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+B">Birong Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+Z">Zepeng Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+X">Xin Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H">Hankun Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+K">Ke Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+T">Tieyun Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Reasoning is fundamental to human intelligence and critical for problem-solving, decision-making, and critical thinking. Reasoning refers to drawing new conclusions based on existing knowledge, which can support various applications such as clinical diagnosis, basic education, and financial analysis. Although a good number of surveys have reviewed reasoning-related methods, none of them has systematically investigated these methods from the viewpoint of the knowledge base they depend on. Both the scenarios to which knowledge bases are applied and their storage formats differ significantly. Hence, investigating reasoning methods from the knowledge base perspective helps us better understand the challenges and future directions. To fill this gap, this paper first classifies knowledge bases into symbolic and parametric ones. The former explicitly store information in human-readable symbols, and the latter implicitly encode knowledge within parameters. Then, we provide a comprehensive overview of reasoning methods using symbolic knowledge bases, parametric knowledge bases, and both of them. Finally, we identify future directions toward enhancing reasoning capabilities to bridge the gap between human and machine intelligence. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20685">arXiv:2412.20685</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20685">pdf</a>, <a href="https://arxiv.org/format/2412.20685">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MarsSQE: Stereo Quality Enhancement for Martian Images Using Bi-level Cross-view Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yinglin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Q">Qunliang Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+X">Xin Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Stereo images captured by Mars rovers are transmitted after lossy compression due to the limited bandwidth between Mars and Earth. Unfortunately, this process results in undesirable compression artifacts. In this paper, we present a novel stereo quality enhancement approach for Martian images, named MarsSQE. First, we establish the first dataset of stereo Martian images. Through extensive analysis of this dataset, we observe that cross-view correlations in Martian images are notably high. Leveraging this insight, we design a bi-level cross-view attention-based quality enhancement network that fully exploits these inherent cross-view correlations. Specifically, our network integrates pixel-level attention for precise matching and patch-level attention for broader contextual information. Experimental results demonstrate the effectiveness of our MarsSQE approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
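<p class="is-size-7"> The bi-level design reads as two cross-attention passes at different granularities. The sketch below is an assumed reconstruction in PyTorch, not the authors&#39; network: queries come from one view, keys and values from the other, once per pixel and once over pooled patches. </p> <pre><code class="language-python">
# Assumed bi-level cross-view attention block (illustrative, simplified).
import torch
import torch.nn as nn

class CrossViewBlock(nn.Module):
    def __init__(self, dim: int = 64, heads: int = 4, patch: int = 8):
        super().__init__()
        self.pixel_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.patch_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.pool = nn.AvgPool2d(patch)  # coarser tokens for the patch level

    def forward(self, left: torch.Tensor, right: torch.Tensor) -> torch.Tensor:
        # left/right: (B, C, H, W) feature maps of the two stereo views.
        b, c, h, w = left.shape
        q = left.flatten(2).transpose(1, 2)              # (B, H*W, C) pixel tokens
        kv = right.flatten(2).transpose(1, 2)
        pix, _ = self.pixel_attn(q, kv, kv)              # pixel level: precise matching
        kv_patch = self.pool(right).flatten(2).transpose(1, 2)
        pat, _ = self.patch_attn(q, kv_patch, kv_patch)  # patch level: broader context
        return (pix + pat).transpose(1, 2).view(b, c, h, w) + left  # residual

blk = CrossViewBlock()
l, r = torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32)
print(blk(l, r).shape)  # torch.Size([2, 64, 32, 32])
</code></pre>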
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19877">arXiv:2412.19877</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19877">pdf</a>, <a href="https://arxiv.org/format/2412.19877">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Image Classification with Deep Reinforcement Active Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiu%2C+M">Mingyuan Jiu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xuguang Song</a>, <a href="/search/cs?searchtype=author&amp;query=Sahbi%2C+H">Hichem Sahbi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shupan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+W">Wei Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Lihua Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingliang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Deep learning currently achieves outstanding performance on different tasks, including image classification, especially when using large neural networks. The success of these models is attributable to the availability of large collections of labeled training data. In many real-world scenarios, labeled data are scarce, and their hand-labeling is demanding in time, effort, and cost. Active learning is an alternative paradigm that mitigates the effort of hand-labeling data: only a small fraction is iteratively selected from a large pool of unlabeled data, annotated by an expert (a.k.a. the oracle), and used to update the learning models. However, existing active learning solutions depend on handcrafted strategies that may fail in highly variable learning environments (datasets, scenarios, etc.). In this work, we devise an adaptive active learning method based on a Markov Decision Process (MDP). Our framework leverages deep reinforcement learning and active learning together with a Deep Deterministic Policy Gradient (DDPG) in order to dynamically adapt sample selection strategies to the oracle&#39;s feedback and the learning environment. Extensive experiments conducted on three different image classification benchmarks show superior performance over several existing active learning strategies. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
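<p class="is-size-7"> The MDP framing above can be sketched as a pool-based selection loop: the state summarizes the unlabeled pool under the current model, the action is a per-sample selection score, and the reward is the change in validation accuracy after labeling. Everything below is a hypothetical skeleton; the policy object and its featurize/act/observe methods stand in for a DDPG agent and are not an existing API. </p> <pre><code class="language-python">
# Hypothetical skeleton of reinforcement-learning-driven active learning.
import numpy as np

def active_learning_loop(policy, train_fn, eval_fn, pool_x, pool_y,
                         rounds: int = 10, batch: int = 32):
    labeled_x, labeled_y = [], []
    prev_acc = 0.0
    for _ in range(rounds):
        state = policy.featurize(pool_x)        # state: pool under current model (stub)
        scores = policy.act(state)              # action: one selection score per sample
        pick = np.argsort(scores)[-batch:]      # send top-scored samples to the oracle
        labeled_x.extend(pool_x[pick]); labeled_y.extend(pool_y[pick])
        pool_x = np.delete(pool_x, pick, axis=0)
        pool_y = np.delete(pool_y, pick, axis=0)
        train_fn(np.asarray(labeled_x), np.asarray(labeled_y))
        acc = eval_fn()                         # validation accuracy after retraining
        policy.observe(state, scores, reward=acc - prev_acc)  # DDPG update (stub)
        prev_acc = acc
</code></pre>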
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18216">arXiv:2412.18216</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18216">pdf</a>, <a href="https://arxiv.org/format/2412.18216">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ICM-Assistant: Instruction-tuning Multimodal Large Language Models for Rule-based Explainable Image Content Moderation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mengyang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yuzhi Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jialun Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingjie Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zhongming Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xuehui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qinbin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Guangneng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shengchao Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+C">Chi-Wing Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Controversial content inundates the Internet, infringing on various cultural norms and child protection standards. Traditional Image Content Moderation (ICM) models fall short in producing precise moderation decisions for diverse standards, while recent multimodal large language models (MLLMs), when adopted for general rule-based ICM, often produce classification and explanation results that are inconsistent with human moderators. Aiming at flexible, explainable, and accurate ICM, we design a novel rule-based dataset generation pipeline, decomposing concise human-defined rules and leveraging well-designed multi-stage prompts to enrich short explicit image annotations. Our ICM-Instruct dataset includes detailed moderation explanations and moderation Q-A pairs. Built upon it, we create our ICM-Assistant model in the framework of rule-based ICM, making it readily applicable in real practice. Our ICM-Assistant model demonstrates exceptional performance and flexibility. Specifically, it significantly outperforms existing approaches on various sources, improving both moderation classification (by 36.8% on average) and moderation explanation quality (by 26.6% on average) consistently over existing MLLMs. Code/Data is available at https://github.com/zhaoyuzhi/ICM-Assistant. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16720">arXiv:2412.16720</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16720">pdf</a>, <a href="https://arxiv.org/format/2412.16720">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenAI o1 System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Jaech%2C+A">Aaron Jaech</a>, <a href="/search/cs?searchtype=author&amp;query=Kalai%2C+A">Adam Kalai</a>, <a href="/search/cs?searchtype=author&amp;query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&amp;query=Richardson%2C+A">Adam Richardson</a>, <a href="/search/cs?searchtype=author&amp;query=El-Kishky%2C+A">Ahmed El-Kishky</a>, <a href="/search/cs?searchtype=author&amp;query=Low%2C+A">Aiden Low</a>, <a href="/search/cs?searchtype=author&amp;query=Helyar%2C+A">Alec Helyar</a>, <a href="/search/cs?searchtype=author&amp;query=Madry%2C+A">Aleksander Madry</a>, <a href="/search/cs?searchtype=author&amp;query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&amp;query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&amp;query=Iftimie%2C+A">Alex Iftimie</a>, <a href="/search/cs?searchtype=author&amp;query=Karpenko%2C+A">Alex Karpenko</a>, <a href="/search/cs?searchtype=author&amp;query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&amp;query=Neitz%2C+A">Alexander Neitz</a>, <a href="/search/cs?searchtype=author&amp;query=Prokofiev%2C+A">Alexander Prokofiev</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+A">Alexander Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Tam%2C+A">Allison Tam</a>, <a href="/search/cs?searchtype=author&amp;query=Bennett%2C+A">Ally Bennett</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+A">Ananya Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Saraiva%2C+A">Andre Saraiva</a>, <a href="/search/cs?searchtype=author&amp;query=Vallone%2C+A">Andrea Vallone</a>, <a href="/search/cs?searchtype=author&amp;query=Duberstein%2C+A">Andrew Duberstein</a>, <a href="/search/cs?searchtype=author&amp;query=Kondrich%2C+A">Andrew Kondrich</a> , et al. (238 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16720v1-abstract-short" style="display: inline;"> The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. 
This leads to state-of-the-art performance on certain benchmarks for risks such as generating illicit advice, choosing stereotyped responses, and succumbing to known jailbreaks. Training models to incorporate a chain of thought before answering has the potential to unlock substantial benefits, while also increasing potential risks that stem from heightened intelligence. Our results underscore the need for building robust alignment methods, extensively stress-testing their efficacy, and maintaining meticulous risk management protocols. This report outlines the safety work carried out for the OpenAI o1 and OpenAI o1-mini models, including safety evaluations, external red teaming, and Preparedness Framework evaluations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15622">arXiv:2412.15622</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15622">pdf</a>, <a href="https://arxiv.org/format/2412.15622">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> TouchASP: Elastic Automatic Speech Perception that Everyone Can Touch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xingchen Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+C">Chengdong Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Pengshen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">ZiYu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Youcheng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Menglong Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+F">Fuping Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Dinghao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhendong Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large Automatic Speech Recognition (ASR) models demand a vast number of parameters, copious amounts of data, and significant computational resources during the training process. However, such models can only be deployed on high-compute cloud platforms and are only capable of performing speech recognition tasks. This leads to high costs and restricted capabilities. In this report, we first propose the elastic mixture of experts (eMoE) model. This model can be trained just once and then be elastically scaled in accordance with deployment requirements. Secondly, we devise an unsupervised data creation and validation procedure and gather millions of hours of audio data from diverse domains for training. Using these two techniques, our system achieves elastic deployment capabilities while reducing the Character Error Rate (CER) on the SpeechIO test sets from 4.98% to 2.45%. Thirdly, our model is not only competent in Mandarin speech recognition but also proficient in multilingual, multi-dialect, emotion, gender, and sound event perception. We refer to this as Automatic Speech Perception (ASP), and the perception results are presented in the experimental section. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report</span> </p>
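<p class="is-size-7"> The &quot;train once, scale at deployment&quot; idea can be illustrated with a router that is simply truncated to the experts kept at inference. The sketch below is an assumed reading of elastic MoE, not the TouchASP implementation. </p> <pre><code class="language-python">
# Assumed sketch of an elastic mixture-of-experts layer.
import torch
import torch.nn as nn

class ElasticMoE(nn.Module):
    def __init__(self, dim: int = 256, num_experts: int = 8, top_k: int = 2):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
            for _ in range(num_experts))
        self.top_k = top_k
        self.active = num_experts            # shrink at deployment, no retraining

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        logits = self.router(x)[..., :self.active]         # route over kept experts only
        k = min(self.top_k, self.active)
        weights, idx = logits.softmax(-1).topk(k, dim=-1)
        weights = weights / weights.sum(-1, keepdim=True)  # renormalize over top-k
        out = torch.zeros_like(x)
        for j in range(k):
            for e in range(self.active):
                mask = idx[..., j] == e                    # tokens routed to expert e
                if mask.any():
                    out[mask] += weights[..., j][mask].unsqueeze(-1) * self.experts[e](x[mask])
        return out

moe = ElasticMoE()
x = torch.randn(2, 5, 256)
print(moe(x).shape)   # torch.Size([2, 5, 256])
moe.active = 4        # deploy a smaller model from the same weights
print(moe(x).shape)   # torch.Size([2, 5, 256])
</code></pre>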
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15267">arXiv:2412.15267</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15267">pdf</a>, <a href="https://arxiv.org/format/2412.15267">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toxicity Detection towards Adaptability to Changing Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+H">Hankun Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jianhao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Miao%2C+X">Xin Miao</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mayi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+M">Ming Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuanyuan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+T">Tieyun Qian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Toxicity detection is crucial for maintaining the peace of society. While existing methods perform well on normal toxic content or content generated by specific perturbation methods, they are vulnerable to evolving perturbation patterns. However, in real-world scenarios, malicious users tend to create new perturbation patterns to fool the detectors. For example, some users may circumvent the detector of large language models (LLMs) by adding &quot;I am a scientist&quot; at the beginning of the prompt. In this paper, we introduce a novel problem, i.e., continual learning of jailbreak perturbation patterns, into the toxicity detection field. To tackle this problem, we first construct a new dataset generated by 9 types of perturbation patterns, 7 of which are summarized from prior work and 2 of which are developed by us. We then systematically validate the vulnerability of current methods on this new perturbation-pattern-aware dataset via both zero-shot and fine-tuned cross-pattern detection. Upon this, we present the domain incremental learning paradigm and the corresponding benchmark to ensure the detector&#39;s robustness to dynamically emerging types of perturbed toxic text. Our code and dataset are provided in the appendix and will be publicly available on GitHub, by which we wish to offer new research opportunities for the security-relevant communities. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.

arXiv:2412.14630 (https://arxiv.org/abs/2412.14630) [cs.CV]
Unified Image Restoration and Enhancement: Degradation Calibrated Cycle Reconstruction Diffusion Model
Authors: Minglong Xue, Jinhong He, Shivakumara Palaiahnakote, Mingliang Zhou
Abstract: Image restoration and enhancement are pivotal for numerous computer vision applications, yet unifying these tasks efficiently remains a significant challenge. Inspired by the iterative refinement capabilities of diffusion models, we propose CycleRDM, a novel framework designed to unify restoration and enhancement tasks while achieving high-quality mapping. Specifically, CycleRDM first learns the mapping relationships among the degraded domain, the rough normal domain, and the normal domain through a two-stage diffusion inference process. Subsequently, we transfer the final calibration process to the wavelet low-frequency domain using the discrete wavelet transform, performing fine-grained calibration from a frequency-domain perspective by leveraging task-specific frequency spaces. To improve restoration quality, we design a feature gain module for the decomposed wavelet high-frequency domain to eliminate redundant features. Additionally, we employ multimodal textual prompts and the Fourier transform to drive stable denoising and reduce randomness during inference. After extensive validation, CycleRDM generalizes effectively to a wide range of image restoration and enhancement tasks while requiring only a small number of training samples, and it is significantly superior on various benchmarks of reconstruction quality and perceptual quality. The source code will be available at https://github.com/hejh8/CycleRDM.
Submitted 19 December, 2024; originally announced December 2024.
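
The wavelet-domain calibration step admits a minimal sketch: decompose, adjust only the low-frequency band, reconstruct. Here refine_ll is a hypothetical placeholder for the learned calibration, which the abstract does not specify:

    import numpy as np
    import pywt

    def refine_ll(ll):
        # placeholder for the learned calibration: a mild pull toward the band mean
        return ll + 0.1 * (ll.mean() - ll)

    img = np.random.rand(64, 64)                 # stand-in for a restored image
    ll, (lh, hl, hh) = pywt.dwt2(img, "haar")    # discrete wavelet transform
    out = pywt.idwt2((refine_ll(ll), (lh, hl, hh)), "haar")  # recombine bands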

arXiv:2412.13790 (https://arxiv.org/abs/2412.13790) [cs.LG]
Toward Efficient Data-Free Unlearning
Authors: Chenhao Zhang, Shaofei Shen, Weitong Chen, Miao Xu
Abstract: Machine unlearning without access to the real data distribution is challenging. The existing method based on data-free distillation achieves unlearning by filtering out synthetic samples containing forgetting information, but it struggles to distill the retaining-related knowledge efficiently. In this work, we show that this problem is due to over-filtering, which reduces the synthesized retaining-related information. We propose a novel method, Inhibited Synthetic PostFilter (ISPF), to tackle this challenge from two perspectives: first, the Inhibited Synthetic component reduces the synthesized forgetting information; second, the PostFilter component fully utilizes the retaining-related information in synthesized samples. Experimental results demonstrate that the proposed ISPF effectively tackles the challenge and outperforms existing methods.
Submitted 18 December, 2024; originally announced December 2024.
Comments: 15 pages, 10 figures, accepted by AAAI 2025
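
One way to picture the post-filtering idea is to filter the forgetting classes out of the teacher's output rather than discarding whole synthetic samples, so every sample still supplies retaining-related signal during distillation. A minimal sketch, with all names illustrative rather than the paper's code:

    import torch
    import torch.nn.functional as F

    def distill_postfilter(student, teacher, synth_x, forget_classes):
        # mask out the forgetting classes and renormalize, so each synthetic
        # sample's retained knowledge is still distilled into the student
        with torch.no_grad():
            t_logits = teacher(synth_x)
        keep = torch.ones(t_logits.size(1), dtype=torch.bool)
        keep[forget_classes] = False
        t_prob = F.softmax(t_logits[:, keep], dim=1)
        s_logp = F.log_softmax(student(synth_x)[:, keep], dim=1)
        return F.kl_div(s_logp, t_prob, reduction="batchmean")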

arXiv:2412.12801 (https://arxiv.org/abs/2412.12801) [cs.CV, cs.LG]
Multi-View Incremental Learning with Structured Hebbian Plasticity for Enhanced Fusion Efficiency
Authors: Yuhong Chen, Ailin Song, Huifeng Yin, Shuai Zhong, Fuhai Chen, Qi Xu, Shiping Wang, Mingkun Xu
Abstract: The rapid evolution of multimedia technology has revolutionized human perception, paving the way for multi-view learning. However, traditional multi-view learning approaches are tailored to scenarios with fixed data views, falling short of emulating the intricate cognitive procedures of the human brain, which processes signals sequentially. Our cerebral architecture seamlessly integrates sequential data through intricate feed-forward and feedback mechanisms. In stark contrast, traditional methods struggle to generalize effectively when confronted with data spanning diverse domains, highlighting the need for innovative strategies that can mimic the brain's adaptability and dynamic integration capabilities. In this paper, we propose a bio-neurologically inspired multi-view incremental framework named MVIL, aimed at emulating the brain's fine-grained fusion of sequentially arriving views. MVIL comprises two fundamental modules: structured Hebbian plasticity and synaptic partition learning. The structured Hebbian plasticity reshapes the structure of the weights to express the high correlation between view representations, facilitating fine-grained fusion of view representations. Synaptic partition learning alleviates drastic changes in weights and retains old knowledge by inhibiting a subset of synapses. These modules bionically play a central role in reinforcing crucial associations between newly acquired information and existing knowledge repositories, thereby enhancing the network's capacity for generalization. Experimental results on six benchmark datasets show MVIL's effectiveness over state-of-the-art methods.
Submitted 17 December, 2024; originally announced December 2024.
Comments: 11 pages
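
For readers unfamiliar with Hebbian updates, below is a minimal sketch of the textbook rule (with Oja's normalization for stability); MVIL's structured variant additionally reshapes which weights the update touches, which this sketch does not model:

    # Hebbian update with Oja's normalization: correlated pre/post activity
    # strengthens a connection; the -post^2 * W term keeps weights bounded.
    import numpy as np

    def hebbian_step(W, pre, post, lr=0.01):
        # W: (out, in), pre: (in,), post: (out,)
        return W + lr * (np.outer(post, pre) - (post ** 2)[:, None] * W)

    rng = np.random.default_rng(0)
    W = rng.normal(scale=0.1, size=(4, 8))
    for _ in range(100):
        pre = rng.normal(size=8)
        W = hebbian_step(W, pre, W @ pre)   # post-synaptic activity = W @ pre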
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14630v1-abstract-full').style.display = 'none'; document.getElementById('2412.14630v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13790">arXiv:2412.13790</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13790">pdf</a>, <a href="https://arxiv.org/format/2412.13790">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toward Efficient Data-Free Unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chenhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+S">Shaofei Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Weitong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Miao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13790v1-abstract-short" style="display: inline;"> Machine unlearning without access to real data distribution is challenging. The existing method based on data-free distillation achieved unlearning by filtering out synthetic samples containing forgetting information but struggled to distill the retaining-related knowledge efficiently. In this work, we analyze that such a problem is due to over-filtering, which reduces the synthesized retaining-re&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13790v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13790v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13790v1-abstract-full" style="display: none;"> Machine unlearning without access to real data distribution is challenging. The existing method based on data-free distillation achieved unlearning by filtering out synthetic samples containing forgetting information but struggled to distill the retaining-related knowledge efficiently. In this work, we analyze that such a problem is due to over-filtering, which reduces the synthesized retaining-related information. We propose a novel method, Inhibited Synthetic PostFilter (ISPF), to tackle this challenge from two perspectives: First, the Inhibited Synthetic, by reducing the synthesized forgetting information; Second, the PostFilter, by fully utilizing the retaining-related information in synthesized samples. Experimental results demonstrate that the proposed ISPF effectively tackles the challenge and outperforms existing methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13790v1-abstract-full').style.display = 'none'; document.getElementById('2412.13790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 10 figures, accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12801">arXiv:2412.12801</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12801">pdf</a>, <a href="https://arxiv.org/format/2412.12801">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Incremental Learning with Structured Hebbian Plasticity for Enhanced Fusion Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yuhong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+A">Ailin Song</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+H">Huifeng Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+S">Shuai Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+F">Fuhai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mingkun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12801v1-abstract-short" style="display: inline;"> The rapid evolution of multimedia technology has revolutionized human perception, paving the way for multi-view learning. However, traditional multi-view learning approaches are tailored for scenarios with fixed data views, falling short of emulating the intricate cognitive procedures of the human brain processing signals sequentially. Our cerebral architecture seamlessly integrates sequential dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12801v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12801v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12801v1-abstract-full" style="display: none;"> The rapid evolution of multimedia technology has revolutionized human perception, paving the way for multi-view learning. However, traditional multi-view learning approaches are tailored for scenarios with fixed data views, falling short of emulating the intricate cognitive procedures of the human brain processing signals sequentially. Our cerebral architecture seamlessly integrates sequential data through intricate feed-forward and feedback mechanisms. 

arXiv:2412.11768 (https://arxiv.org/abs/2412.11768) [cs.LG, cs.AI]
No More Adam: Learning Rate Scaling at Initialization is All You Need
Authors: Minghao Xu, Lichuan Xiang, Xu Cai, Hongkai Wen
Abstract: In this work, we question the necessity of adaptive gradient methods for training deep neural networks. SGD-SaI is a simple yet effective enhancement to stochastic gradient descent with momentum (SGDM). SGD-SaI performs learning rate Scaling at Initialization (SaI) for distinct parameter groups, guided by their respective gradient signal-to-noise ratios (g-SNR). By adjusting learning rates without relying on adaptive second-order momentum, SGD-SaI helps prevent training imbalances from the very first iteration and cuts the optimizer's memory usage in half compared to AdamW. Despite its simplicity and efficiency, SGD-SaI consistently matches or outperforms AdamW on a variety of Transformer-based tasks, effectively overcoming the long-standing challenge of using SGD to train Transformers. SGD-SaI excels in ImageNet-1K classification with Vision Transformers (ViT) and in GPT-2 pretraining for large language models (LLMs, transformer decoder-only), demonstrating robustness to hyperparameter variations and practicality for diverse applications. We further tested its robustness on tasks like LoRA fine-tuning for LLMs and diffusion models, where it consistently outperforms state-of-the-art optimizers. From a memory-efficiency perspective, SGD-SaI achieves substantial savings on optimizer states, reducing memory usage by 5.93 GB for GPT-2 (1.5B parameters) and 25.15 GB for Llama2-7B compared to AdamW in full-precision training settings.
Submitted 17 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: 20 pages, 10 figures
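
A minimal sketch of the scale-at-initialization recipe, under one plausible reading of g-SNR (norm of the mean minibatch gradient over the norm of its per-element standard deviation); the paper's exact formula and parameter grouping may differ:

    import torch

    def gsnr_scales(model, loss_fn, batches, eps=1e-8):
        # batches: a few (x, y) minibatches at init; needs >= 2 for a std
        grads = {n: [] for n, _ in model.named_parameters()}
        for x, y in batches:
            model.zero_grad()
            loss_fn(model(x), y).backward()
            for n, p in model.named_parameters():
                grads[n].append(p.grad.detach().clone())
        scales = {}
        for n, gs in grads.items():
            g = torch.stack(gs)                   # (num_batches, *param_shape)
            scales[n] = (g.mean(0).norm() / (g.std(0).norm() + eps)).item()
        return scales

    # Usage: one lr per parameter (the paper groups parameters), fixed once at
    # init; no adaptive second-order state is kept afterwards.
    # scales = gsnr_scales(model, loss_fn, init_batches)
    # opt = torch.optim.SGD([{"params": [p], "lr": 0.1 * scales[n]}
    #                        for n, p in model.named_parameters()], momentum=0.9)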

arXiv:2412.11293 (https://arxiv.org/abs/2412.11293) [cs.LG, cs.AI]
A Comparative Study on Dynamic Graph Embedding based on Mamba and Transformers
Authors: Ashish Parmanand Pandey, Alan John Varghese, Sarang Patil, Mengjia Xu
Abstract: Dynamic graph embedding has emerged as an important technique for modeling complex time-evolving networks across diverse domains. While transformer-based models have shown promise in capturing long-range dependencies in temporal graph data, they face scalability challenges due to quadratic computational complexity. This study presents a comparative analysis of dynamic graph embedding approaches using transformers and the recently proposed Mamba architecture, a state-space model with linear complexity. We introduce three novel models: TransformerG2G augmented with graph convolutional networks, DG-Mamba, and GDG-Mamba with graph isomorphism network edge convolutions. Our experiments on multiple benchmark datasets demonstrate that Mamba-based models achieve comparable or superior performance to transformer-based approaches in link prediction tasks while offering significant computational efficiency gains on longer sequences. Notably, DG-Mamba variants consistently outperform transformer-based models on datasets with high temporal variability, such as UCI, Bitcoin, and Reality Mining, while maintaining competitive performance on more stable graphs like SBM. We provide insights into the learned temporal dependencies through analysis of attention weights and state matrices, revealing the models' ability to capture complex temporal patterns. By effectively combining state-space models with graph neural networks, our work addresses key limitations of previous approaches and contributes to the growing body of research on efficient temporal graph representation learning. These findings offer promising directions for scaling dynamic graph embedding to larger, more complex real-world networks, potentially enabling new applications in areas such as social network analysis, financial modeling, and biological system dynamics.
Submitted 15 December, 2024; originally announced December 2024.
Comments: 18 pages, 6 figures

arXiv:2412.10961 (https://arxiv.org/abs/2412.10961) [cs.LG, cs.AI]
PSMGD: Periodic Stochastic Multi-Gradient Descent for Fast Multi-Objective Optimization
Authors: Mingjing Xu, Peizhong Ju, Jia Liu, Haibo Yang
Abstract: Multi-objective optimization (MOO) lies at the core of many machine learning (ML) applications that involve multiple, potentially conflicting objectives (e.g., multi-task learning and multi-objective reinforcement learning, among many others). Despite the long history of MOO, recent years have witnessed a surge of interest within the ML community in the development of gradient-manipulation algorithms for MOO, thanks to the availability of gradient information in many ML problems. However, existing gradient-manipulation methods for MOO often suffer from long training times, primarily due to the need to compute dynamic weights by solving an additional optimization problem that determines a common descent direction capable of decreasing all objectives simultaneously. To address this challenge, we propose a new and efficient algorithm called Periodic Stochastic Multi-Gradient Descent (PSMGD) to accelerate MOO. PSMGD is motivated by the key observation that the dynamic weights across objectives exhibit only small changes under minor updates over short intervals during optimization. Consequently, PSMGD is designed to periodically compute these dynamic weights and reuse them, thereby effectively reducing the computational overhead. Theoretically, we prove that PSMGD achieves state-of-the-art convergence rates for strongly convex, general convex, and non-convex functions. Additionally, we introduce a new computational complexity measure, termed backpropagation complexity, and demonstrate that PSMGD achieves an objective-independent backpropagation complexity. Through extensive experiments, we verify that PSMGD provides comparable or superior performance to state-of-the-art MOO algorithms while significantly reducing training time.
Submitted 16 December, 2024; v1 submitted 14 December, 2024; originally announced December 2024.
Comments: Accepted to AAAI 2025
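
The periodic-reuse idea is easy to sketch for two objectives, where the min-norm (MGDA-style) weight has a closed form; the reuse period K and the solver choice below are illustrative, not the paper's exact algorithm:

    import torch

    def mgda_two(g1, g2, eps=1e-12):
        # argmin_a || a*g1 + (1-a)*g2 ||^2 over [0, 1]: closed form for 2 objectives
        a = torch.dot(g2 - g1, g2) / ((g1 - g2).pow(2).sum() + eps)
        return float(a.clamp(0.0, 1.0))

    def psmgd_step(params, grads_fn, state, lr=0.1, K=10):
        g1, g2 = grads_fn()                     # flat gradients, one per objective
        if state["t"] % K == 0:                 # recompute dynamic weights...
            state["a"] = mgda_two(g1, g2)
        a = state["a"]                          # ...and reuse them in between
        params -= lr * (a * g1 + (1 - a) * g2)  # common-descent update
        state["t"] += 1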
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">extended version of &#34;Causally Consistent Normalizing Flow&#34; accepted by AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11768">arXiv:2412.11768</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.11768">pdf</a>, <a href="https://arxiv.org/format/2412.11768">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> No More Adam: Learning Rate Scaling at Initialization is All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minghao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+L">Lichuan Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+X">Xu Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+H">Hongkai Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11768v2-abstract-short" style="display: inline;"> In this work, we question the necessity of adaptive gradient methods for training deep neural networks. SGD-SaI is a simple yet effective enhancement to stochastic gradient descent with momentum (SGDM). SGD-SaI performs learning rate Scaling at Initialization (SaI) to distinct parameter groups, guided by their respective gradient signal-to-noise ratios (g-SNR). By adjusting learning rates without&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11768v2-abstract-full').style.display = 'inline'; document.getElementById('2412.11768v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11768v2-abstract-full" style="display: none;"> In this work, we question the necessity of adaptive gradient methods for training deep neural networks. SGD-SaI is a simple yet effective enhancement to stochastic gradient descent with momentum (SGDM). SGD-SaI performs learning rate Scaling at Initialization (SaI) to distinct parameter groups, guided by their respective gradient signal-to-noise ratios (g-SNR). By adjusting learning rates without relying on adaptive second-order momentum, SGD-SaI helps prevent training imbalances from the very first iteration and cuts the optimizer&#39;s memory usage by half compared to AdamW. Despite its simplicity and efficiency, SGD-SaI consistently matches or outperforms AdamW in training a variety of Transformer-based tasks, effectively overcoming a long-standing challenge of using SGD for training Transformers. SGD-SaI excels in ImageNet-1K classification with Vision Transformers(ViT) and GPT-2 pretraining for large language models (LLMs, transformer decoder-only), demonstrating robustness to hyperparameter variations and practicality for diverse applications. We further tested its robustness on tasks like LoRA fine-tuning for LLMs and diffusion models, where it consistently outperforms state-of-the-art optimizers. 

arXiv:2412.08486 (https://arxiv.org/abs/2412.08486) [cs.CV]
Learning Flow Fields in Attention for Controllable Person Image Generation
Authors: Zijian Zhou, Shikun Liu, Xiao Han, Haozhe Liu, Kam Woh Ng, Tian Xie, Yuren Cong, Hang Li, Mengmeng Xu, Juan-Manuel Pérez-Rúa, Aditya Patel, Tao Xiang, Miaojing Shi, Sen He
Abstract: Controllable person image generation aims to generate a person image conditioned on reference images, allowing precise control over the person's appearance or pose. However, prior methods often distort fine-grained textural details from the reference image, despite achieving high overall image quality. We attribute these distortions to inadequate attention to corresponding regions in the reference image. To address this, we propose learning flow fields in attention (Leffa), which explicitly guides the target query to attend to the correct reference key in the attention layer during training. Specifically, it is realized via a regularization loss on top of the attention map within a diffusion-based baseline. Our extensive experiments show that Leffa achieves state-of-the-art performance in controlling appearance (virtual try-on) and pose (pose transfer), significantly reducing fine-grained detail distortion while maintaining high image quality. Additionally, we show that our loss is model-agnostic and can be used to improve the performance of other diffusion models.
Submitted 12 December, 2024; v1 submitted 11 December, 2024; originally announced December 2024.
Comments: github: https://github.com/franciszzj/Leffa, demo: https://huggingface.co/spaces/franciszzj/Leffa, model: https://huggingface.co/franciszzj/Leffa
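
A hedged reading of the regularization loss: given a known correspondence from each target query to its correct reference key, penalize attention mass that lands elsewhere. The correspondence indices are assumed given here; in Leffa they are tied to a learned flow field:

    import torch
    import torch.nn.functional as F

    def attention_flow_loss(attn, target_idx, eps=1e-8):
        # attn: (num_queries, num_keys), rows sum to 1
        # target_idx: (num_queries,) correct reference key per target query
        return F.nll_loss(torch.log(attn + eps), target_idx)

    attn = torch.softmax(torch.randn(16, 64), dim=-1)       # toy attention map
    loss = attention_flow_loss(attn, torch.randint(0, 64, (16,)))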

arXiv:2412.07289 (https://arxiv.org/abs/2412.07289) [cs.CL, cs.AI]
Enhancing Relation Extraction via Supervised Rationale Verification and Feedback
Authors: Yongqi Li, Xin Miao, Shen Zhou, Mayi Xu, Yuyang Ren, Tieyun Qian
Abstract: Despite the rapid progress that existing automated feedback methods have made in correcting the output of large language models (LLMs), these methods cannot be well applied to the relation extraction (RE) task due to their designated feedback objectives and correction manner. To address this problem, we propose a novel automated feedback framework for RE, which presents a rationale supervisor to verify the rationale and provides re-selected demonstrations as feedback to correct the initial prediction. Specifically, we first design a causal intervention and observation method to collect biased/unbiased rationales for contrastively training the rationale supervisor. Then, we present a verification-feedback-correction procedure to iteratively enhance LLMs' capability of handling the RE task. Extensive experiments prove that our proposed framework significantly outperforms existing methods.
Submitted 10 December, 2024; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: Accepted to AAAI 2025, camera-ready version
class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
