Search | arXiv e-print repository
Showing 1–50 of 2,053 results for author: Wang, Q

Searching in archive cs, sorted by announcement date (newest first), 50 results per page. (Search v0.5.6, released 2020-02-24.)
1. arXiv:2411.14378 [pdf, other] (physics.flu-dyn, cs.LG)
CoNFiLD-inlet: Synthetic Turbulence Inflow Using Generative Latent Diffusion Models with Neural Fields
Authors: Xin-Yang Liu, Meet Hemant Parikh, Xiantao Fan, Pan Du, Qing Wang, Yi-Fan Chen, Jian-Xun Wang
Abstract: Eddy-resolving turbulence simulations require stochastic inflow conditions that accurately replicate the complex, multi-scale structures of turbulence. Traditional recycling-based methods rely on computationally expensive precursor simulations, while existing synthetic inflow generators often fail to reproduce realistic coherent structures of turbulence. Recent advances in deep learning (DL) have opened new possibilities for inflow turbulence generation, yet many DL-based methods rely on deterministic, autoregressive frameworks prone to error accumulation, resulting in poor robustness for long-term predictions. In this work, we present CoNFiLD-inlet, a novel DL-based inflow turbulence generator that integrates diffusion models with a conditional neural field (CNF)-encoded latent space to produce realistic, stochastic inflow turbulence. By parameterizing inflow conditions using Reynolds numbers, CoNFiLD-inlet generalizes effectively across a wide range of Reynolds numbers ($Re_\tau$ between $10^3$ and $10^4$) without requiring retraining or parameter tuning. Comprehensive validation through a priori and a posteriori tests in Direct Numerical Simulation (DNS) and Wall-Modeled Large Eddy Simulation (WMLES) demonstrates its high fidelity, robustness, and scalability, positioning it as an efficient and versatile solution for inflow turbulence synthesis.
Submitted 21 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14295">arXiv:2411.14295</a> <span> [<a href="https://arxiv.org/pdf/2411.14295">pdf</a>, <a href="https://arxiv.org/format/2411.14295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> StereoCrafter-Zero: Zero-Shot Stereo Video Generation with Noisy Restart </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jian Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhenyu Li</a>, <a href="/search/cs?searchtype=author&query=Wonka%2C+P">Peter Wonka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14295v1-abstract-short" style="display: inline;"> Generating high-quality stereo videos that mimic human binocular vision requires maintaining consistent depth perception and temporal coherence across frames. While diffusion models have advanced image and video synthesis, generating high-quality stereo videos remains challenging due to the difficulty of maintaining consistent temporal and spatial coherence between left and right views. We introdu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14295v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14295v1-abstract-full" style="display: none;"> Generating high-quality stereo videos that mimic human binocular vision requires maintaining consistent depth perception and temporal coherence across frames. While diffusion models have advanced image and video synthesis, generating high-quality stereo videos remains challenging due to the difficulty of maintaining consistent temporal and spatial coherence between left and right views. We introduce \textit{StereoCrafter-Zero}, a novel framework for zero-shot stereo video generation that leverages video diffusion priors without the need for paired training data. Key innovations include a noisy restart strategy to initialize stereo-aware latents and an iterative refinement process that progressively harmonizes the latent space, addressing issues like temporal flickering and view inconsistencies. Comprehensive evaluations, including quantitative metrics and user studies, demonstrate that \textit{StereoCrafter-Zero} produces high-quality stereo videos with improved depth consistency and temporal smoothness, even when depth estimations are imperfect. Our framework is robust and adaptable across various diffusion models, setting a new benchmark for zero-shot stereo video generation and enabling more immersive visual experiences. Our code can be found in~\url{https://github.com/shijianjian/StereoCrafter-Zero}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14295v1-abstract-full').style.display = 'none'; document.getElementById('2411.14295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13949">arXiv:2411.13949</a> <span> [<a href="https://arxiv.org/pdf/2411.13949">pdf</a>, <a href="https://arxiv.org/format/2411.13949">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Separable Mixture of Low-Rank Adaptation for Continual Visual Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqi Wang</a>, <a href="/search/cs?searchtype=author&query=Che%2C+C">Chang Che</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yangyang Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zenglin Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13949v1-abstract-short" style="display: inline;"> Visual instruction tuning (VIT) enables multimodal large language models (MLLMs) to effectively handle a wide range of vision tasks by framing them as language-based instructions. Building on this, continual visual instruction tuning (CVIT) extends the capability of MLLMs to incrementally learn new tasks, accommodating evolving functionalities. While prior work has advanced CVIT through the develo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13949v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13949v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13949v1-abstract-full" style="display: none;"> Visual instruction tuning (VIT) enables multimodal large language models (MLLMs) to effectively handle a wide range of vision tasks by framing them as language-based instructions. Building on this, continual visual instruction tuning (CVIT) extends the capability of MLLMs to incrementally learn new tasks, accommodating evolving functionalities. While prior work has advanced CVIT through the development of new benchmarks and approaches to mitigate catastrophic forgetting, these efforts largely follow traditional continual learning paradigms, neglecting the unique challenges specific to CVIT. We identify a dual form of catastrophic forgetting in CVIT, where MLLMs not only forget previously learned visual understanding but also experience a decline in instruction following abilities as they acquire new tasks. 
To address this, we introduce the Separable Mixture of Low-Rank Adaptation (SMoLoRA) framework, which employs separable routing through two distinct modules, one for visual understanding and another for instruction following. This dual-routing design enables specialized adaptation in both domains, preventing forgetting while improving performance. Furthermore, we propose a novel CVIT benchmark that goes beyond existing benchmarks by additionally evaluating a model's ability to generalize to unseen tasks and handle diverse instructions across various tasks. Extensive experiments demonstrate that SMoLoRA outperforms existing methods in mitigating dual forgetting, improving generalization to unseen tasks, and ensuring robustness in following diverse instructions.
Submitted 21 November, 2024; originally announced November 2024.
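The dual-routing idea in this abstract is easy to miss in prose, so here is a minimal sketch of separable routing over two LoRA pools, one for visual understanding and one for instruction following. This is a hypothetical illustration under my own naming (LoRAPool, SMoLoRALinear) and routing scheme, not the paper's implementation.

```python
# Minimal sketch of separable mixture-of-LoRA routing (hypothetical
# names and details, not the paper's implementation). Two adapter
# pools are routed independently; their outputs are added to a
# frozen base projection.
import torch
import torch.nn as nn

class LoRAPool(nn.Module):
    def __init__(self, d_in, d_out, n_experts=4, rank=8):
        super().__init__()
        self.A = nn.Parameter(torch.randn(n_experts, d_in, rank) * 0.01)
        self.B = nn.Parameter(torch.zeros(n_experts, rank, d_out))
        self.router = nn.Linear(d_in, n_experts)

    def forward(self, x):                        # x: (batch, d_in)
        w = self.router(x).softmax(dim=-1)       # routing weights (batch, n_experts)
        delta = torch.einsum('bi,eir,ero->beo', x, self.A, self.B)
        return (w.unsqueeze(-1) * delta).sum(dim=1)

class SMoLoRALinear(nn.Module):
    """Frozen base layer plus two separably routed adapter pools."""
    def __init__(self, base: nn.Linear):
        super().__init__()
        self.base = base.requires_grad_(False)   # pretrained weights stay fixed
        self.visual = LoRAPool(base.in_features, base.out_features)
        self.instruct = LoRAPool(base.in_features, base.out_features)

    def forward(self, x):
        return self.base(x) + self.visual(x) + self.instruct(x)
```

A production variant would route per token and add load balancing; the sketch only shows the separable two-pool structure.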
4. arXiv:2411.13587 [pdf, other] (cs.RO, cs.AI)
Exploring the Adversarial Vulnerabilities of Vision-Language-Action Models in Robotics
Authors: Taowen Wang, Dongfang Liu, James Chenhao Liang, Wenhao Yang, Qifan Wang, Cheng Han, Jiebo Luo, Ruixiang Tang
Abstract: Recently in robotics, Vision-Language-Action (VLA) models have emerged as a transformative approach, enabling robots to execute complex tasks by integrating visual and linguistic inputs within an end-to-end learning framework. While VLA models offer significant capabilities, they also introduce new attack surfaces, making them vulnerable to adversarial attacks. With these vulnerabilities largely unexplored, this paper systematically quantifies the robustness of VLA-based robotic systems. Recognizing the unique demands of robotic execution, our attack objectives target the inherent spatial and functional characteristics of robotic systems. In particular, we introduce an untargeted position-aware attack objective that leverages spatial foundations to destabilize robotic actions, and a targeted attack objective that manipulates the robotic trajectory. Additionally, we design an adversarial patch generation approach that places a small, colorful patch within the camera's view, effectively executing the attack in both digital and physical environments. Our evaluation reveals a marked degradation in task success rates, with up to a 100% reduction across a suite of simulated robotic tasks, highlighting critical security gaps in current VLA architectures. By unveiling these vulnerabilities and proposing actionable evaluation metrics, this work advances both the understanding and enhancement of safety for VLA-based robotic systems, underscoring the necessity for developing robust defense strategies prior to physical-world deployments.
Submitted 17 November, 2024; originally announced November 2024.
5. arXiv:2411.13056 [pdf, other] (cs.CV)
Efficient Masked AutoEncoder for Video Object Counting and A Large-Scale Benchmark
Authors: Bing Cao, Quanhao Lu, Jiekang Feng, Pengfei Zhu, Qinghua Hu, Qilong Wang
Abstract: The dynamic imbalance of the fore-background is a major challenge in video object counting, which is usually caused by the sparsity of foreground objects. This often leads to severe under- and over-prediction problems and has been less studied in existing works. To tackle this issue in video object counting, we propose a density-embedded Efficient Masked Autoencoder Counting (E-MAC) framework in this paper. To effectively capture the dynamic variations across frames, we utilize an optical flow-based temporal collaborative fusion that aligns features to derive multi-frame density residuals. The counting accuracy of the current frame is boosted by harnessing the information from adjacent frames. More importantly, to empower the representation ability of dynamic foreground objects within each frame, we first take the density map as an auxiliary modality to perform Density-Embedded Masked mOdeling (DEMO) for multimodal self-representation learning to regress the density map. However, while DEMO contributes effective cross-modal regression guidance, it also brings in redundant background information, making it hard to focus on foreground regions. To handle this dilemma, we further propose an efficient spatial adaptive masking derived from density maps to boost efficiency. In addition, considering that most existing datasets are limited to human-centric scenarios, we propose DroneBird, a large video bird-counting dataset in natural scenarios, for migratory bird protection. Extensive experiments on three crowd datasets and our DroneBird validate our superiority over the counterparts.
Submitted 20 November, 2024; originally announced November 2024.
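As a rough illustration of density-derived spatial masking, the toy function below keeps only the highest-density patches before encoding. The keep-ratio thresholding and names are assumptions for illustration, not E-MAC's actual procedure.

```python
# Toy density-guided spatial masking (an illustration of the general
# idea, not E-MAC's procedure): keep the patches with the highest
# predicted density and drop the rest before the encoder.
import torch

def density_adaptive_mask(patches, density, keep_ratio=0.25):
    """patches: (B, N, D) patch tokens; density: (B, N) per-patch density."""
    B, N, D = patches.shape
    k = max(1, int(N * keep_ratio))
    idx = density.topk(k, dim=1).indices                   # densest patches
    visible = torch.gather(patches, 1, idx.unsqueeze(-1).expand(B, k, D))
    return visible, idx                                    # kept tokens + positions
```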
6. arXiv:2411.12701 [pdf, other] (cs.CR, cs.AI)
When Backdoors Speak: Understanding LLM Backdoor Attacks Through Model-Generated Explanations
Authors: Huaizhi Ge, Yiming Li, Qifan Wang, Yongfeng Zhang, Ruixiang Tang
Abstract: Large Language Models (LLMs) are vulnerable to backdoor attacks, where hidden triggers can maliciously manipulate model behavior. While several backdoor attack methods have been proposed, the mechanisms by which backdoor functions operate in LLMs remain underexplored. In this paper, we move beyond attacking LLMs and investigate backdoor functionality through the novel lens of natural language explanations. Specifically, we leverage LLMs' generative capabilities to produce human-understandable explanations for their decisions, allowing us to compare explanations for clean and poisoned samples. We explore various backdoor attacks and embed the backdoor into LLaMA models for multiple tasks. Our experiments show that backdoored models produce higher-quality explanations for clean data compared to poisoned data, while generating significantly more consistent explanations for poisoned data than for clean data. We further analyze the explanation generation process, revealing that at the token level, the explanation token of poisoned samples only appears in the final few transformer layers of the LLM. At the sentence level, attention dynamics indicate that poisoned inputs shift attention away from the input context when generating the explanation. These findings deepen our understanding of backdoor attack mechanisms in LLMs and offer a framework for detecting such vulnerabilities through explainability techniques, contributing to the development of more secure LLMs.
Submitted 19 November, 2024; originally announced November 2024.
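The "more consistent explanations for poisoned data" finding suggests a simple probe: sample several explanations per input and score their pairwise similarity. The sketch below assumes an arbitrary sentence-embedding function `embed` returning unit-norm vectors; it is an illustrative metric, not the paper's analysis code.

```python
# Illustrative consistency probe (assumptions: `embed` is any
# sentence-embedding function returning unit-norm vectors, and at
# least two explanations are sampled per input). A higher mean
# pairwise cosine similarity means more consistent explanations.
import itertools
import numpy as np

def explanation_consistency(explanations, embed):
    vecs = [embed(e) for e in explanations]
    sims = [float(np.dot(a, b)) for a, b in itertools.combinations(vecs, 2)]
    return sum(sims) / len(sims)
```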
7. arXiv:2411.12690 [pdf, other] (cs.CE)
MORE-Stress: Model Order Reduction based Efficient Numerical Algorithm for Thermal Stress Simulation of TSV Arrays in 2.5D/3D IC
Authors: Tianxiang Zhu, Qipan Wang, Yibo Lin, Runsheng Wang, Ru Huang
Abstract: Thermomechanical stress induced by through-silicon vias (TSVs) plays an important role in the performance and reliability analysis of 2.5D/3D ICs. While the finite element method (FEM) adopted by commercial software can provide accurate simulation results, it is very time- and memory-consuming for large-scale analysis. Over the past decade, the linear superposition method has been utilized to perform fast thermal stress estimations of TSV arrays, but it suffers from a lack of accuracy. In this paper, we propose MORE-Stress, a novel strict numerical algorithm for efficient thermal stress simulation of TSV arrays based on model order reduction. Extensive experimental results demonstrate that our algorithm can realize a 153-504x reduction in computational time and a 39-115x reduction in memory usage compared with the commercial software ANSYS, with negligible errors of less than 1%. Our algorithm is as efficient as the linear superposition method, with an order of magnitude smaller errors and fast convergence.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 2025 Design, Automation & Test in Europe Conference & Exhibition (DATE)
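For readers unfamiliar with model order reduction, the core step for a linear FEM system has a standard Galerkin form, sketched below with NumPy. This is the generic projection idea only; MORE-Stress's actual construction of the reduced basis for TSV arrays is more involved.

```python
# Generic Galerkin model-order-reduction step (illustration only, not
# the MORE-Stress algorithm). K: (n, n) stiffness matrix, f: (n,)
# load vector, V: (n, r) reduced basis with r much smaller than n.
import numpy as np

def reduced_solve(K, f, V):
    K_r = V.T @ K @ V              # (r, r) reduced stiffness
    f_r = V.T @ f                  # (r,) reduced load
    q = np.linalg.solve(K_r, f_r)  # solve the small system
    return V @ q                   # lift back to the full space
```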
8. arXiv:2411.11354 [pdf, other] (cs.CV, cs.AI)
A comprehensive survey of oracle character recognition: challenges, benchmarks, and beyond
Authors: Jing Li, Xueke Chi, Qiufeng Wang, Dahan Wang, Kaizhu Huang, Yongge Liu, Cheng-lin Liu
Abstract: Oracle character recognition, the analysis of ancient Chinese inscriptions found on oracle bones, has become a pivotal field intersecting archaeology, paleography, and historical cultural studies. Traditional methods of oracle character recognition have relied heavily on manual interpretation by experts, which is not only labor-intensive but also limits broader accessibility to the general public. With recent breakthroughs in pattern recognition and deep learning, there is a growing movement towards the automation of oracle character recognition (OrCR), showing considerable promise in tackling the challenges inherent to these ancient scripts. However, a comprehensive understanding of OrCR still remains elusive. Therefore, this paper presents a systematic and structured survey of the current landscape of OrCR research. We commence by identifying and analyzing the key challenges of OrCR. Then, we provide an overview of the primary benchmark datasets and digital resources available for OrCR. A review of contemporary research methodologies follows, in which their respective efficacies, limitations, and applicability to the complex nature of oracle characters are critically highlighted and examined. Additionally, our review extends to ancillary tasks associated with OrCR across diverse disciplines, providing a broad-spectrum analysis of its applications. We conclude with a forward-looking perspective, proposing potential avenues for future investigations that could yield significant advancements in the field.
Submitted 18 November, 2024; originally announced November 2024.

9. arXiv:2411.10693 [pdf, other] (cs.CV)
Multi-perspective Contrastive Logit Distillation
Authors: Qi Wang, Jinjia Zhou
Abstract: We propose a novel and efficient logit distillation method, Multi-perspective Contrastive Logit Distillation (MCLD), which leverages contrastive learning to distill logits from multiple perspectives in knowledge distillation. Recent research on logit distillation has primarily focused on maximizing the information learned from the teacher model's logits to enhance the performance of the student model. To this end, we propose MCLD, which consists of three key components: Instance-wise CLD, Sample-wise CLD, and Category-wise CLD. These components are designed to facilitate the transfer of more information from the teacher's logits to the student model. Comprehensive evaluations on image classification tasks using CIFAR-100 and ImageNet, alongside representation transferability assessments on STL-10 and Tiny-ImageNet, highlight the significant advantages of our method. Knowledge distillation with our MCLD surpasses existing state-of-the-art methods.
Submitted 15 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, 11 tabels, 9 formulas, including pseudo-code</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10484">arXiv:2411.10484</a> <span> [<a href="https://arxiv.org/pdf/2411.10484">pdf</a>, <a href="https://arxiv.org/format/2411.10484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> iFlow: An Interactive Max-Flow/Min-Cut Algorithms Visualizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+M">Muyang Ye</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tianrui Xia</a>, <a href="/search/cs?searchtype=author&query=Zu%2C+T">Tianxin Zu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Kempe%2C+D">David Kempe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10484v1-abstract-short" style="display: inline;"> The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10484v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10484v1-abstract-full" style="display: none;"> The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate and beginning graduate algorithms classes. However, these algorithms -- and in particular the so-called residual graphs they utilize -- often pose significant challenges for students. To help students achieve a deeper understanding, we developed iFlow, an interactive visualization tool for the Ford-Fulkerson Algorithm and its variants. iFlow lets users design or import flow networks, and execute the algorithm by hand. In particular, the user can select an augmentation path and amount, and then update the residual graph. The user is given detailed feedback on mistakes, and can also have iFlow auto-complete each step, to use it as a demonstration tool while still in the initial learning stages. iFlow has been made publicly available and open-sourced. 
10. arXiv:2411.10484 [pdf, other] (cs.HC, cs.DS)
iFlow: An Interactive Max-Flow/Min-Cut Algorithms Visualizer
Authors: Muyang Ye, Tianrui Xia, Tianxin Zu, Qian Wang, David Kempe
Abstract: The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate and beginning graduate algorithms classes. However, these algorithms, and in particular the so-called residual graphs they utilize, often pose significant challenges for students. To help students achieve a deeper understanding, we developed iFlow, an interactive visualization tool for the Ford-Fulkerson Algorithm and its variants. iFlow lets users design or import flow networks and execute the algorithm by hand. In particular, the user can select an augmentation path and amount, and then update the residual graph. The user is given detailed feedback on mistakes, and can also have iFlow auto-complete each step, to use it as a demonstration tool while still in the initial learning stages. iFlow has been made publicly available and open-sourced. We deployed iFlow in an undergraduate algorithms class and collected students' self-reported learning benefits via an optional survey. All respondents considered the tool at least somewhat useful and engaging, with most rating it either as useful/engaging or very useful/engaging. Students also generally reported a significant increase in understanding of the algorithm.
Submitted 13 November, 2024; originally announced November 2024.
Comments: Accepted by SIGCSE 2025 TS. Due to the page limit, the appendix could not be included in the SIGCSE version, so it is included in this arXiv version, which the SIGCSE version points to.
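Since the residual graph is exactly the sticking point this entry targets, a compact reference implementation may help; this is the textbook Edmonds-Karp (BFS-based Ford-Fulkerson) algorithm that iFlow visualizes, not iFlow's own code.

```python
# Textbook Edmonds-Karp (BFS-based Ford-Fulkerson), for reference;
# this is the algorithm iFlow visualizes, not iFlow's own code.
from collections import deque

def max_flow(cap, s, t):
    """cap: dict of dicts of nonnegative capacities. Returns max s-t flow."""
    residual = {u: dict(vs) for u, vs in cap.items()}
    for u in cap:                                # add zero-capacity reverse edges
        for v in cap[u]:
            residual.setdefault(v, {}).setdefault(u, 0)
    flow = 0
    while True:
        parent = {s: None}
        q = deque([s])
        while q and t not in parent:             # BFS: shortest augmenting path
            u = q.popleft()
            for v, c in residual.get(u, {}).items():
                if c > 0 and v not in parent:
                    parent[v] = u
                    q.append(v)
        if t not in parent:
            return flow                          # no augmenting path remains
        path, v = [], t                          # recover the s-t path
        while parent[v] is not None:
            path.append((parent[v], v))
            v = parent[v]
        aug = min(residual[u][v] for u, v in path)   # bottleneck capacity
        for u, v in path:                        # update the residual graph
            residual[u][v] -= aug
            residual[v][u] += aug
        flow += aug

# e.g. max_flow({'s': {'a': 3, 'b': 2}, 'a': {'t': 2}, 'b': {'t': 3}}, 's', 't') == 4
```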
11. arXiv:2411.10060 [pdf, other] (cs.MM, cs.CL)
CMATH: Cross-Modality Augmented Transformer with Hierarchical Variational Distillation for Multimodal Emotion Recognition in Conversation
Authors: Xiaofei Zhu, Jiawei Cheng, Zhou Yang, Zhuo Chen, Qingyang Wang, Jianfeng Yao
Abstract: Multimodal emotion recognition in conversation (MER) aims to accurately identify emotions in conversational utterances by integrating multimodal information. Previous methods usually treat multimodal information as being of equal quality and employ symmetric architectures to conduct multimodal fusion. However, in reality, the quality of different modalities usually varies considerably, and utilizing a symmetric architecture makes it difficult to accurately recognize conversational emotions when dealing with uneven modal information. Furthermore, fusing multimodal information at a single granularity may fail to adequately integrate modal information, exacerbating the inaccuracy in emotion recognition. In this paper, we propose a novel Cross-Modality Augmented Transformer with Hierarchical Variational Distillation, called CMATH, which consists of two major components: Multimodal Interaction Fusion and Hierarchical Variational Distillation. The former comprises two submodules, Modality Reconstruction and the Cross-Modality Augmented Transformer (CMA-Transformer), where Modality Reconstruction focuses on obtaining a high-quality compressed representation of each modality, and the CMA-Transformer adopts an asymmetric fusion strategy that treats one modality as the central modality and takes the others as auxiliary modalities. The latter first designs a variational fusion network to fuse the fine-grained representations learned by the CMA-Transformer into a coarse-grained representation. It then introduces a hierarchical distillation framework to maintain the consistency between modality representations of different granularities. Experiments on the IEMOCAP and MELD datasets demonstrate that our proposed model outperforms previous state-of-the-art baselines. Implementation code is available at https://github.com/cjw-MER/CMATH.
Submitted 15 November, 2024; originally announced November 2024.
12. arXiv:2411.09953 [pdf, other] (cs.RO)
Brain-inspired Action Generation with Spiking Transformer Diffusion Policy Model
Authors: Qianhao Wang, Yinqian Sun, Enmeng Lu, Qian Zhang, Yi Zeng
Abstract: Spiking Neural Networks (SNNs) have the ability to extract spatio-temporal features thanks to their spiking sequences. Previous research, however, has primarily focused on image classification and reinforcement learning. In our paper, we put forward a novel diffusion policy model based on Spiking Transformer Neural Networks and the Denoising Diffusion Probabilistic Model (DDPM): the Spiking Transformer Modulate Diffusion Policy Model (STMDP), a new brain-inspired model for generating robot action trajectories. To improve the performance of this model, we develop a novel decoder module, the Spiking Modulate Decoder (SMD), which replaces the traditional decoder module within the Transformer architecture. Additionally, we explored substituting DDPM with Denoising Diffusion Implicit Models (DDIM) in our framework. We conducted experiments across four robotic manipulation tasks and performed ablation studies on the modulate block. Our model consistently outperforms existing Transformer-based diffusion policy methods; in particular, on the Can task we achieved an improvement of 8%. The proposed STMDP method integrates SNNs, diffusion models and the Transformer architecture, offering new perspectives and promising directions for exploration in brain-inspired robotics.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09953v1-abstract-full').style.display = 'none'; document.getElementById('2411.09953v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures and 2 tables, conference submission</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Q25 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.9 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09703">arXiv:2411.09703</a> <span> [<a href="https://arxiv.org/pdf/2411.09703">pdf</a>, <a href="https://arxiv.org/format/2411.09703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MagicQuill: An Intelligent Interactive Image Editing System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zichen Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yue Yu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+H">Hao Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuyu Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+K+L">Ka Leong Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yujun Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09703v1-abstract-short" style="display: inline;"> Image editing involves a variety of complex tasks and requires efficient and precise manipulation techniques. In this paper, we present MagicQuill, an integrated image editing system that enables swift actualization of creative ideas. Our system features a streamlined yet functionally robust interface, allowing for the articulation of editing operations (e.g., inserting elements, erasing objects,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09703v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09703v1-abstract-full" style="display: none;"> Image editing involves a variety of complex tasks and requires efficient and precise manipulation techniques. In this paper, we present MagicQuill, an integrated image editing system that enables swift actualization of creative ideas. Our system features a streamlined yet functionally robust interface, allowing for the articulation of editing operations (e.g., inserting elements, erasing objects, altering color) with minimal input. 
These interactions are monitored by a multimodal large language model (MLLM) to anticipate editing intentions in real time, bypassing the need for explicit prompt entry. Finally, we apply a powerful diffusion prior, enhanced by a carefully learned two-branch plug-in module, to process editing requests with precise control. Experimental results demonstrate the effectiveness of MagicQuill in achieving high-quality image edits. Please visit https://magic-quill.github.io to try out our system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09703v1-abstract-full').style.display = 'none'; document.getElementById('2411.09703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code and demo available at https://magic-quill.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07934">arXiv:2411.07934</a> <span> [<a href="https://arxiv.org/pdf/2411.07934">pdf</a>, <a href="https://arxiv.org/format/2411.07934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Doubly Mild Generalization for Offline Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yixiu Mao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Y">Yun Qu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuhang Jiang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07934v2-abstract-short" style="display: inline;"> Offline Reinforcement Learning (RL) suffers from the extrapolation error and value overestimation. From a generalization perspective, this issue can be attributed to the over-generalization of value functions or policies towards out-of-distribution (OOD) actions. Significant efforts have been devoted to mitigating such generalization, and recent in-sample learning approaches have further succeeded… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07934v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07934v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07934v2-abstract-full" style="display: none;"> Offline Reinforcement Learning (RL) suffers from the extrapolation error and value overestimation. From a generalization perspective, this issue can be attributed to the over-generalization of value functions or policies towards out-of-distribution (OOD) actions. 
Significant efforts have been devoted to mitigating such generalization, and recent in-sample learning approaches have further succeeded in entirely eschewing it. Nevertheless, we show that mild generalization beyond the dataset can be trusted and leveraged to improve performance under certain conditions. To appropriately exploit generalization in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild action generalization and (ii) mild generalization propagation. The former refers to selecting actions in a close neighborhood of the dataset to maximize the Q values. Even so, potentially erroneous generalization can still be propagated, accumulated, and exacerbated by bootstrapping. In light of this, the latter concept is introduced to mitigate the generalization propagation without impeding the propagation of RL learning signals. Theoretically, DMG guarantees better performance than the in-sample optimal policy in the oracle generalization scenario. Even under worst-case generalization, DMG can still control value overestimation at a certain level and lower bound the performance. Empirically, DMG achieves state-of-the-art performance across Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting from its flexibility in both generalization aspects, DMG enjoys a seamless transition from offline to online learning and attains strong online fine-tuning performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07934v2-abstract-full').style.display = 'none'; document.getElementById('2411.07934v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
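<p class="is-size-7">The "mild action generalization" component above can be read as maximizing Q over actions drawn from a small neighborhood of the logged dataset action. A toy Python sketch of that reading; the neighborhood radius, candidate count, and Q-network below are illustrative assumptions, not values or code from the paper:</p> <pre><code class="language-python">
import torch

def mild_action_generalization(q_net, state, dataset_action, radius=0.1, n_candidates=16):
    """Pick the Q-maximizing action within a small L-infinity ball around a dataset action."""
    # sample candidate actions close to the logged action
    noise = (torch.rand(n_candidates, dataset_action.shape[-1]) * 2 - 1) * radius
    candidates = (dataset_action.unsqueeze(0) + noise).clamp(-1.0, 1.0)
    states = state.unsqueeze(0).expand(n_candidates, -1)
    q_values = q_net(torch.cat([states, candidates], dim=-1)).squeeze(-1)
    return candidates[q_values.argmax()]

# toy Q-network over concatenated (state, action) pairs
q_net = torch.nn.Sequential(torch.nn.Linear(10, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1))
action = mild_action_generalization(q_net, state=torch.randn(7), dataset_action=torch.zeros(3))
</code></pre>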
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06660">arXiv:2411.06660</a> <span> [<a href="https://arxiv.org/pdf/2411.06660">pdf</a>, <a href="https://arxiv.org/format/2411.06660">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bridge: A Unified Framework to Knowledge Graph Completion via Language Models and Knowledge Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiao%2C+Q">Qiao Qiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuepei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kang Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06660v1-abstract-short" style="display: inline;"> Knowledge graph completion (KGC) is a task of inferring missing triples based on existing Knowledge Graphs (KGs). Both structural and semantic information are vital for successful KGC. However, existing methods only use either the structural knowledge from the KG embeddings or the semantic information from pre-trained language models (PLMs), leading to suboptimal model performance. Moreover, since… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06660v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06660v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06660v1-abstract-full" style="display: none;"> Knowledge graph completion (KGC) is a task of inferring missing triples based on existing Knowledge Graphs (KGs). Both structural and semantic information are vital for successful KGC. However, existing methods only use either the structural knowledge from the KG embeddings or the semantic information from pre-trained language models (PLMs), leading to suboptimal model performance. Moreover, since PLMs are not trained on KGs, directly using PLMs to encode triples may be inappropriate. To overcome these limitations, we propose a novel framework called Bridge, which jointly encodes structural and semantic information of KGs. Specifically, we strategically encode entities and relations separately by PLMs to better utilize the semantic knowledge of PLMs and enable structured representation learning via a structural learning principle. Furthermore, to bridge the gap between KGs and PLMs, we employ a self-supervised representation learning method called BYOL to fine-tune PLMs with two different views of a triple. Unlike BYOL, which uses augmentation methods to create two semantically similar views of the same image, potentially altering the semantic information. We strategically separate the triple into two parts to create different views, thus avoiding semantic alteration. 
Experiments demonstrate that Bridge outperforms the SOTA models on three benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06660v1-abstract-full').style.display = 'none'; document.getElementById('2411.06660v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06558">arXiv:2411.06558</a> <span> [<a href="https://arxiv.org/pdf/2411.06558">pdf</a>, <a href="https://arxiv.org/format/2411.06558">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Region-Aware Text-to-Image Generation via Hard Binding and Soft Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhennan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yajie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofan Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhibo Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhengkai Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Tai%2C+Y">Ying Tai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06558v2-abstract-short" style="display: inline;"> Regional prompting, or compositional generation, which enables fine-grained spatial control, has gained increasing attention for its practicality in real-world applications. However, previous methods either introduce additional trainable modules, and are thus only applicable to specific models, or manipulate score maps within cross-attention layers using attention masks, resulting in limited control st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06558v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06558v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06558v2-abstract-full" style="display: none;"> Regional prompting, or compositional generation, which enables fine-grained spatial control, has gained increasing attention for its practicality in real-world applications. However, previous methods either introduce additional trainable modules, and are thus only applicable to specific models, or manipulate score maps within cross-attention layers using attention masks, resulting in limited control strength when the number of regions increases. To handle these limitations, we present RAG, a Regional-Aware text-to-image Generation method conditioned on regional descriptions for precise layout composition.
RAG decouples multi-region generation into two sub-tasks: the construction of individual regions (Regional Hard Binding), which ensures that each regional prompt is properly executed, and the overall detail refinement (Regional Soft Refinement) over regions, which dismisses visual boundaries and enhances adjacent interactions. Furthermore, RAG makes repainting feasible in a novel way: users can modify specific unsatisfactory regions of the last generation while keeping all other regions unchanged, without relying on additional inpainting models. Our approach is tuning-free and applicable to other frameworks as an enhancement to their prompt-following ability. Quantitative and qualitative experiments demonstrate that RAG achieves superior performance in attribute binding and object relationships compared with previous tuning-free methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06558v2-abstract-full').style.display = 'none'; document.getElementById('2411.06558v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code is available at https://github.com/NJU-PCALab/RAG-Diffusion</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06106">arXiv:2411.06106</a> <span> [<a href="https://arxiv.org/pdf/2411.06106">pdf</a>, <a href="https://arxiv.org/format/2411.06106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Personalize to generalize: Towards a universal medical multi-modality generalization through personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+T">Tan Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyi Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chen Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiufeng Wang</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+A">Anh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuan Qi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yuan Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06106v2-abstract-short" style="display: inline;"> The differences among medical imaging modalities, driven by distinct underlying principles, pose significant challenges for generalization in multi-modal
medical tasks. Beyond modality gaps, individual variations, such as differences in organ size and metabolic rate, further impede a model's ability to generalize effectively across both modalities and diverse populations. Despite the importance of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06106v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06106v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06106v2-abstract-full" style="display: none;"> The differences among medical imaging modalities, driven by distinct underlying principles, pose significant challenges for generalization in multi-modal medical tasks. Beyond modality gaps, individual variations, such as differences in organ size and metabolic rate, further impede a model's ability to generalize effectively across both modalities and diverse populations. Despite the importance of personalization, existing approaches to multi-modal generalization often neglect individual differences, focusing solely on common anatomical features. This limitation may result in weakened generalization in various medical tasks. In this paper, we unveil that personalization is critical for multi-modal generalization. Specifically, we propose an approach to achieve personalized generalization through approximating the underlying personalized invariant representation ${X}_h$ across various modalities by leveraging individual-level constraints and a learnable biological prior. We validate the feasibility and benefits of learning a personalized ${X}_h$, showing that this representation is highly generalizable and transferable across various multi-modal medical tasks. Extensive experimental results consistently show that the additionally incorporated personalization significantly improves performance and generalization across diverse scenarios, confirming its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06106v2-abstract-full').style.display = 'none'; document.getElementById('2411.06106v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
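<p class="is-size-7">One plausible way to read the individual-level constraint above is as an alignment loss that pulls each modality's embedding of the same individual toward a shared estimate of the personalized representation ${X}_h$. The PyTorch sketch below illustrates only that alignment pattern; the paper's actual constraints and learnable biological prior are not reproduced here:</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def individual_alignment_loss(z_by_modality):
    """Pull each modality's embedding of an individual toward their cross-modality mean."""
    # z_by_modality: list of (B, D) embeddings, one tensor per imaging modality,
    # with row b in every tensor belonging to the same individual
    stacked = torch.stack(z_by_modality)     # (M, B, D)
    x_h = stacked.mean(dim=0, keepdim=True)  # consensus estimate per individual
    return F.mse_loss(stacked, x_h.expand_as(stacked))

z_ct, z_mri = torch.randn(4, 128), torch.randn(4, 128)  # two modalities, 4 individuals
loss = individual_alignment_loss([z_ct, z_mri])
</code></pre>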
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05824">arXiv:2411.05824</a> <span> [<a href="https://arxiv.org/pdf/2411.05824">pdf</a>, <a href="https://arxiv.org/format/2411.05824">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Navigating Distribution Shifts in Medical Image Analysis: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zixian Su</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jingwei Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiufeng Wang</a>, <a href="/search/cs?searchtype=author&query=Coenen%2C+F">Frans Coenen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05824v1-abstract-short" style="display: inline;"> Medical Image Analysis (MedIA) has become indispensable in modern healthcare, enhancing clinical diagnostics and personalized treatment. Despite the remarkable advancements supported by deep learning (DL) technologies, their practical deployment faces challenges due to distribution shifts, where models trained on specific datasets underperform across others from varying hospitals, regions, or pati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05824v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05824v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05824v1-abstract-full" style="display: none;"> Medical Image Analysis (MedIA) has become indispensable in modern healthcare, enhancing clinical diagnostics and personalized treatment. Despite the remarkable advancements supported by deep learning (DL) technologies, their practical deployment faces challenges due to distribution shifts, where models trained on specific datasets underperform across others from varying hospitals, regions, or patient populations. To navigate this issue, researchers have been actively developing strategies to increase the adaptability and robustness of DL models, enabling their effective use in unfamiliar and diverse environments. This paper systematically reviews approaches that apply DL techniques to MedIA systems affected by distribution shifts. Unlike traditional categorizations based on technical specifications, our approach is grounded in the real-world operational constraints faced by healthcare institutions. Specifically, we categorize the existing body of work into Joint Training, Federated Learning, Fine-tuning, and Domain Generalization, with each method tailored to distinct scenarios caused by Data Accessibility, Privacy Concerns, and Collaborative Protocols. 
This perspective equips researchers with a nuanced understanding of how DL can be strategically deployed to address distribution shifts in MedIA, ensuring diverse and robust medical applications. By delving deeper into these topics, we highlight potential pathways for future research that not only address existing limitations but also push the boundaries of deployable MedIA technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05824v1-abstract-full').style.display = 'none'; document.getElementById('2411.05824v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03925">arXiv:2411.03925</a> <span> [<a href="https://arxiv.org/pdf/2411.03925">pdf</a>, <a href="https://arxiv.org/ps/2411.03925">ps</a>, <a href="https://arxiv.org/format/2411.03925">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Quantum Algorithm for Sparse Online Learning with Truncated Gradient Descent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+D">Debbie Lim</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yixian Qiu</a>, <a href="/search/cs?searchtype=author&query=Rebentrost%2C+P">Patrick Rebentrost</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qisheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03925v1-abstract-short" style="display: inline;"> Logistic regression, the Support Vector Machine (SVM), and least squares are well-studied methods in the statistical and computer science community, with various practical applications. High-dimensional data arriving on a real-time basis makes the design of online learning algorithms that produce sparse solutions essential. The seminal work of Langford, Li, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03925v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03925v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03925v1-abstract-full" style="display: none;"> Logistic regression, the Support Vector Machine (SVM), and least squares are well-studied methods in the statistical and computer science community, with various practical applications. High-dimensional data arriving on a real-time basis makes the design of online learning algorithms that produce sparse solutions essential. The seminal work of Langford, Li, and Zhang (2009) developed a method to obtain sparsity via truncated gradient descent, showing a near-optimal online regret bound. Based on this method, we develop a quantum sparse online learning algorithm for logistic regression, the SVM, and least squares.
Given efficient quantum access to the inputs, we show that a quadratic speedup in the time complexity with respect to the dimension of the problem is achievable, while maintaining a regret of $O(1/\sqrt{T})$, where $T$ is the number of iterations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03925v1-abstract-full').style.display = 'none'; document.getElementById('2411.03925v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">31 pages, 1 table, 4 algorithms</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03862">arXiv:2411.03862</a> <span> [<a href="https://arxiv.org/pdf/2411.03862">pdf</a>, <a href="https://arxiv.org/format/2411.03862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> ROBIN: Robust and Invisible Watermarks for Diffusion Models with Adversarial Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+H">Huayang Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03862v1-abstract-short" style="display: inline;"> Watermarking generative content serves as a vital tool for authentication, ownership protection, and mitigation of potential misuse. Existing watermarking methods face the challenge of balancing robustness and concealment. They empirically inject a watermark that is both invisible and robust and passively achieve concealment by limiting the strength of the watermark, thus reducing the robustness.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03862v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03862v1-abstract-full" style="display: none;"> Watermarking generative content serves as a vital tool for authentication, ownership protection, and mitigation of potential misuse. Existing watermarking methods face the challenge of balancing robustness and concealment. They empirically inject a watermark that is both invisible and robust and passively achieve concealment by limiting the strength of the watermark, thus reducing the robustness. In this paper, we propose to explicitly introduce a watermark hiding process to actively achieve concealment, thus allowing the embedding of stronger watermarks. 
To be specific, we implant a robust watermark in an intermediate diffusion state and then guide the model to hide the watermark in the final generated image. We employ an adversarial optimization algorithm to produce the optimal hiding prompt guiding signal for each watermark. The prompt embedding is optimized to minimize artifacts in the generated image, while the watermark is optimized to achieve maximum strength. The watermark can be verified by reversing the generation process. Experiments on various diffusion models demonstrate that the watermark remains verifiable even under significant image tampering and shows superior invisibility compared to other state-of-the-art robust watermarking methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03862v1-abstract-full').style.display = 'none'; document.getElementById('2411.03862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02847">arXiv:2411.02847</a> <span> [<a href="https://arxiv.org/pdf/2411.02847">pdf</a>, <a href="https://arxiv.org/format/2411.02847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dissecting the Failure of Invariant Learning on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qixun Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yisen Wang</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+X">Xianghua Ying</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02847v2-abstract-short" style="display: inline;"> Enhancing node-level Out-Of-Distribution (OOD) generalization on graphs remains a crucial area of research. In this paper, we develop a Structural Causal Model (SCM) to theoretically dissect the performance of two prominent invariant learning methods -- Invariant Risk Minimization (IRM) and Variance-Risk Extrapolation (VREx) -- in node-level OOD settings. Our analysis reveals a critical limitation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02847v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02847v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02847v2-abstract-full" style="display: none;"> Enhancing node-level Out-Of-Distribution (OOD) generalization on graphs remains a crucial area of research.
In this paper, we develop a Structural Causal Model (SCM) to theoretically dissect the performance of two prominent invariant learning methods -- Invariant Risk Minimization (IRM) and Variance-Risk Extrapolation (VREx) -- in node-level OOD settings. Our analysis reveals a critical limitation: due to the lack of class-conditional invariance constraints, these methods may struggle to accurately identify the structure of the predictive invariant ego-graph and consequently rely on spurious features. To address this, we propose Cross-environment Intra-class Alignment (CIA), which explicitly eliminates spurious features by aligning cross-environment representations conditioned on the same class, bypassing the need for explicit knowledge of the causal pattern structure. To adapt CIA to node-level OOD scenarios where environment labels are hard to obtain, we further propose CIA-LRA (Localized Reweighting Alignment) that leverages the distribution of neighboring labels to selectively align node representations, effectively distinguishing and preserving invariant features while removing spurious ones, all without relying on environment labels. We theoretically prove CIA-LRA's effectiveness by deriving an OOD generalization error bound based on PAC-Bayesian analysis. Experiments on graph OOD benchmarks validate the superiority of CIA and CIA-LRA, marking a significant advancement in node-level OOD generalization. The code is available at https://github.com/NOVAglow646/NeurIPS24-Invariant-Learning-on-Graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02847v2-abstract-full').style.display = 'none'; document.getElementById('2411.02847v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02794">arXiv:2411.02794</a> <span> [<a href="https://arxiv.org/pdf/2411.02794">pdf</a>, <a href="https://arxiv.org/format/2411.02794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Text Detection with Similar Mask in Traffic, Industrial, and Natural Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02794v1-abstract-short" style="display: inline;"> Texts in the intelligent transportation scene contain rich information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation.
Unlike the general scene, detecting text in transportation imposes extra demands, such as fast inference speed, in addition to high accuracy. Most existing real-time text detection methods are based on the shrink mask, which los… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02794v1-abstract-full" style="display: none;"> Texts in the intelligent transportation scene contain rich information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike the general scene, detecting text in transportation imposes extra demands, such as fast inference speed, in addition to high accuracy. Most existing real-time text detection methods are based on the shrink mask, which loses some geometric semantic information and needs complex post-processing. In addition, previous methods usually focus on correct output, ignoring feature correction and lacking guidance during the intermediate process. To this end, we propose an efficient multi-scene text detector that contains an effective text representation, the similar mask (SM), and a feature correction module (FCM). Unlike previous methods, the former aims to preserve the geometric information of the instances as much as possible. Its post-processing saves 50$\%$ of the time, accurately and efficiently reconstructing text contours. The latter encourages false positive features to move away from the positive feature center, optimizing the predictions at the feature level. Ablation studies demonstrate the efficiency of the SM and the effectiveness of the FCM. Moreover, the deficiencies of existing traffic datasets (such as low-quality annotations or the unavailability of closed-source data) motivated us to collect and annotate a traffic text dataset, which introduces motion blur. In addition, to validate the scene robustness of the SM-Net, we conduct experiments on traffic, industrial, and natural scene datasets. Extensive experiments verify that it achieves state-of-the-art (SOTA) performance on several benchmarks. The code and dataset are available at: \url{https://github.com/fengmulin/SMNet}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'none'; document.getElementById('2411.02794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
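<p class="is-size-7">The feature correction idea above ("encourages false positive features to move away from the positive feature center") can be sketched as a per-pixel penalty on false positives that remain close to the centroid of true text features. The formulation below is an assumption for illustration, not the released SMNet code:</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def feature_correction_loss(features, pred_mask, gt_mask, margin=0.5):
    """Penalize false-positive pixel features that sit near the true-text centroid."""
    # features: (B, C, H, W); masks: (B, 1, H, W) with values in {0, 1}
    feats = features.flatten(2)                       # (B, C, HW)
    pos = gt_mask.flatten(2) > 0.5                    # true text pixels
    fp = (pred_mask.flatten(2) > 0.5) & ~pos          # predicted text, actually background
    center = (feats * pos).sum(-1, keepdim=True) / pos.sum(-1, keepdim=True).clamp(min=1)
    sim = F.cosine_similarity(feats, center, dim=1)   # (B, HW) similarity to text center
    return (F.relu(sim - margin) * fp.squeeze(1)).mean()

loss = feature_correction_loss(torch.randn(2, 64, 32, 32),
                               torch.rand(2, 1, 32, 32), torch.rand(2, 1, 32, 32))
</code></pre>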
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02265">arXiv:2411.02265</a> <span> [<a href="https://arxiv.org/pdf/2411.02265">pdf</a>, <a href="https://arxiv.org/format/2411.02265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+X">Xingwu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yiqing Huang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruobing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaipeng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jonny Han</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+X">Xiaobo Shu</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+J">Jiahao Bu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongzhi Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuemeng Huang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+F">Fengzong Lian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Saiyong Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jianfeng Yan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Yuyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+X">Xiaoqin Ren</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chao Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lulu Wu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yue Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Suncong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kan Wu</a> , et al. (83 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02265v3-abstract-short" style="display: inline;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. 
We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'inline'; document.getElementById('2411.02265v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02265v3-abstract-full" style="display: none;"> In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance when compared to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02265v3-abstract-full').style.display = 'none'; document.getElementById('2411.02265v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
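<p class="is-size-7">For readers unfamiliar with the total-versus-activated parameter distinction above: in a mixture-of-experts layer, a router selects a small subset of experts per token, so only that subset's parameters participate in any forward pass. A generic top-1 routing sketch in PyTorch with toy sizes; this is the standard MoE pattern, not Tencent's released implementation:</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class TinyMoE(nn.Module):
    """Top-1 mixture-of-experts layer: one expert's parameters are activated per token."""
    def __init__(self, dim=64, n_experts=4):
        super().__init__()
        self.gate = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_experts))

    def forward(self, x):                      # x: (tokens, dim)
        scores = self.gate(x).softmax(dim=-1)  # routing probabilities per token
        top = scores.argmax(dim=-1)            # index of the chosen expert
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            sel = top == i
            if sel.any():
                out[sel] = expert(x[sel]) * scores[sel, i].unsqueeze(-1)
        return out

y = TinyMoE()(torch.randn(10, 64))
</code></pre>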
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 4 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01573">arXiv:2411.01573</a> <span> [<a href="https://arxiv.org/pdf/2411.01573">pdf</a>, <a href="https://arxiv.org/format/2411.01573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Conditional Controllable Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+B">Bing Cao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xingxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pengfei Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qilong Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01573v1-abstract-short" style="display: inline;"> Image fusion aims to integrate complementary information from multiple input images acquired through various sources to synthesize a new fused image. Existing methods usually employ distinct constraint designs tailored to specific scenes, forming fixed fusion paradigms. However, this data-driven fusion approach is challenging to deploy in varying scenarios, especially in rapidly changing environme… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01573v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01573v1-abstract-full" style="display: none;"> Image fusion aims to integrate complementary information from multiple input images acquired through various sources to synthesize a new fused image. Existing methods usually employ distinct constraint designs tailored to specific scenes, forming fixed fusion paradigms. However, this data-driven fusion approach is challenging to deploy in varying scenarios, especially in rapidly changing environments. To address this issue, we propose a conditional controllable fusion (CCF) framework for general image fusion tasks without specific training. Due to the dynamic differences of different samples, our CCF employs specific fusion constraints for each individual in practice. Given the powerful generative capabilities of the denoising diffusion model, we first inject the specific constraints into the pre-trained DDPM as adaptive fusion conditions. The appropriate conditions are dynamically selected to ensure the fusion process remains responsive to the specific requirements in each reverse diffusion stage. Thus, CCF enables conditionally calibrating the fused images step by step. 
Extensive experiments validate the effectiveness of our approach on general fusion tasks across diverse scenarios against competing methods, without additional training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01573v1-abstract-full').style.display = 'none'; document.getElementById('2411.01573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01327">arXiv:2411.01327</a> <span> [<a href="https://arxiv.org/pdf/2411.01327">pdf</a>, <a href="https://arxiv.org/format/2411.01327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Visual Fourier Prompt Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+R">Runjia Zeng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+C">Cheng Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chunshu Wu</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+T">Tong Geng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lifu Huang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y+N">Ying Nian Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dongfang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01327v2-abstract-short" style="display: inline;"> With the scale of vision Transformer-based models continuing to grow, finetuning these large-scale pretrained models for new tasks has become increasingly parameter-intensive. Visual prompt tuning is introduced as a parameter-efficient finetuning (PEFT) method in response to this trend. Despite its successes, a notable research challenge persists within almost all PEFT approaches: significant performance degr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01327v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01327v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01327v2-abstract-full" style="display: none;"> With the scale of vision Transformer-based models continuing to grow, finetuning these large-scale pretrained models for new tasks has become increasingly parameter-intensive. Visual prompt tuning is introduced as a parameter-efficient finetuning (PEFT) method in response to this trend.
Despite its successes, a notable research challenge persists within almost all PEFT approaches: significant performance degradation is observed when there is a substantial disparity between the datasets used in the pretraining and finetuning phases. To address this challenge, we draw inspiration from human visual cognition, and propose the Visual Fourier Prompt Tuning (VFPT) method as a general and effective solution for adapting large-scale transformer-based models. Our approach innovatively incorporates the Fast Fourier Transform into prompt embeddings and harmoniously considers both spatial and frequency domain information. Apart from its inherent simplicity and intuitiveness, VFPT exhibits superior performance across all datasets, offering a general solution to dataset challenges, irrespective of data disparities. Empirical results demonstrate that our approach outperforms current state-of-the-art baselines on two benchmarks, with low parameter usage (e.g., 0.57% of model parameters on VTAB-1k) and notable performance enhancements (e.g., 73.20% of mean accuracy on VTAB-1k). Our code is available at https://github.com/runtsang/VFPT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01327v2-abstract-full').style.display = 'none'; document.getElementById('2411.01327v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">[NeurIPS 2024] Homepage: https://runjia.tech/vfpt_page/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01172">arXiv:2411.01172</a> <span> [<a href="https://arxiv.org/pdf/2411.01172">pdf</a>, <a href="https://arxiv.org/format/2411.01172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Covariance-based Space Regularization for Few-shot Class Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yijie Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowei Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiu-Feng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01172v1-abstract-short" style="display: inline;"> Few-shot Class Incremental Learning (FSCIL) presents a challenging yet realistic scenario, which requires the model to continually learn new classes with limited labeled data (i.e., incremental sessions) while
retaining knowledge of previously learned base classes (i.e., base sessions). Due to the limited data in incremental sessions, models are prone to overfitting new classes and suffering catas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01172v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01172v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01172v1-abstract-full" style="display: none;"> Few-shot Class Incremental Learning (FSCIL) presents a challenging yet realistic scenario, which requires the model to continually learn new classes with limited labeled data (i.e., incremental sessions) while retaining knowledge of previously learned base classes (i.e., base sessions). Due to the limited data in incremental sessions, models are prone to overfitting new classes and suffering catastrophic forgetting of base classes. To tackle these issues, recent advancements resort to prototype-based approaches to constrain the base class distribution and learn discriminative representations of new classes. Despite the progress, the limited data issue still induces an ill-divided feature space, leading the model to confuse new classes with old ones or to fail to achieve good separation among new classes. In this paper, we aim to mitigate these issues by directly constraining the span of each class distribution from a covariance perspective. In detail, we propose a simple yet effective covariance constraint loss to force the model to learn each class distribution with the same covariance matrix. In addition, we propose a perturbation approach to perturb the few-shot training samples in the feature space, which encourages the samples to be away from the weighted distribution of other classes. Regarding perturbed samples as new class data, the classifier is forced to establish explicit boundaries between each new class and the existing ones. Our approach is easy to integrate into existing FSCIL approaches to boost performance. Experiments on three benchmarks validate the effectiveness of our approach, achieving new state-of-the-art performance in FSCIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01172v1-abstract-full').style.display = 'none'; document.getElementById('2411.01172v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
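<p>The covariance constraint at the heart of the abstract above (arXiv:2411.01172) admits a compact rendering. Below is a minimal sketch, assuming batched per-class features and the mean class covariance as the shared target; the names and the exact penalty are illustrative, not the paper's released code.</p> <pre><code>import torch

def covariance_constraint_loss(features, labels):
    # Estimate one covariance matrix per class from (N, D) features, then
    # penalize deviation from the mean class covariance, pushing every
    # class distribution toward the same shape.
    covs = []
    for c in labels.unique():
        x = features[labels == c]
        if x.shape[0] < 2:
            continue  # need at least two samples to estimate a covariance
        x = x - x.mean(dim=0, keepdim=True)
        covs.append(x.t().mm(x) / (x.shape[0] - 1))
    covs = torch.stack(covs)                 # (C, D, D)
    target = covs.mean(dim=0, keepdim=True)  # shared covariance target
    return ((covs - target) ** 2).mean()     # Frobenius-style penalty
</code></pre>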
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV2025,10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00888">arXiv:2411.00888</a> <span> [<a href="https://arxiv.org/pdf/2411.00888">pdf</a>, <a href="https://arxiv.org/format/2411.00888">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Topology-Aware Graph Augmentation for Predicting Clinical Trajectories in Neurocognitive Disorders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianqian Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuqi Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hong-Jun Li</a>, <a href="/search/cs?searchtype=author&query=Bozoki%2C+A">Andrea Bozoki</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mingxia Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00888v1-abstract-short" style="display: inline;"> Brain networks/graphs derived from resting-state functional MRI (fMRI) help study underlying pathophysiology of neurocognitive disorders by measuring neuronal activities in the brain. Some studies utilize learning-based methods for brain network analysis, but typically suffer from low model generalizability caused by scarce labeled fMRI data. As a notable self-supervised strategy, graph contrastiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00888v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00888v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00888v1-abstract-full" style="display: none;"> Brain networks/graphs derived from resting-state functional MRI (fMRI) help study underlying pathophysiology of neurocognitive disorders by measuring neuronal activities in the brain. Some studies utilize learning-based methods for brain network analysis, but typically suffer from low model generalizability caused by scarce labeled fMRI data. As a notable self-supervised strategy, graph contrastive learning helps leverage auxiliary unlabeled data. But existing methods generally arbitrarily perturb graph nodes/edges to generate augmented graphs, without considering essential topology information of brain networks. To this end, we propose a topology-aware graph augmentation (TGA) framework, comprising a pretext model to train a generalizable encoder on large-scale unlabeled fMRI cohorts and a task-specific model to perform downstream tasks on a small target dataset. 
In the pretext model, we design two novel topology-aware graph augmentation strategies: (1) hub-preserving node dropping that prioritizes preserving brain hub regions according to node importance, and (2) weight-dependent edge removing that focuses on keeping important functional connectivities based on edge weights. Experiments on 1,688 fMRI scans suggest that TGA outperforms several state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00888v1-abstract-full').style.display = 'none'; document.getElementById('2411.00888v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00444">arXiv:2411.00444</a> <span> [<a href="https://arxiv.org/pdf/2411.00444">pdf</a>, <a href="https://arxiv.org/format/2411.00444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Expert-level protocol translation for self-driving labs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yu-Zhe Shi</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanxu Meng</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+H">Haofei Hou</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+Z">Zhangqian Bi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qiao Xu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+L">Lecheng Ruan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qining Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00444v1-abstract-short" style="display: inline;"> Recent development in Artificial Intelligence (AI) models has propelled their application in scientific discovery, but the validation and exploration of these discoveries require subsequent empirical experimentation. The concept of self-driving laboratories promises to automate and thus boost the experimental process following AI-driven discoveries. However, the transition of experimental protocol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00444v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00444v1-abstract-full" style="display: none;"> Recent development in Artificial Intelligence (AI) models has propelled their application in scientific discovery, but the validation and exploration of these discoveries require subsequent empirical experimentation. The concept of self-driving laboratories promises to automate and thus boost the experimental process following AI-driven discoveries.
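<p>The two topology-aware augmentations of arXiv:2411.00888 above are easy to prototype on a weighted adjacency matrix. A minimal sketch follows; the inverse-importance sampling rules are our assumption of one plausible instantiation, not the paper's exact procedure.</p> <pre><code>import numpy as np

def hub_preserving_node_drop(adj, drop_ratio=0.1, rng=None):
    # Drop nodes with probability inversely related to weighted degree,
    # so high-degree "hub" regions are preferentially preserved.
    rng = rng if rng is not None else np.random.default_rng()
    degree = np.abs(adj).sum(axis=1)
    p = 1.0 / (degree + 1e-8)
    p = p / p.sum()
    n_drop = int(adj.shape[0] * drop_ratio)
    dropped = rng.choice(adj.shape[0], size=n_drop, replace=False, p=p)
    keep = np.setdiff1d(np.arange(adj.shape[0]), dropped)
    return adj[np.ix_(keep, keep)]

def weight_dependent_edge_removal(adj, remove_ratio=0.1, rng=None):
    # Remove edges with probability inversely related to |weight|,
    # keeping the strongest functional connectivities intact.
    rng = rng if rng is not None else np.random.default_rng()
    out = adj.copy()
    i, j = np.triu_indices_from(adj, k=1)
    w = np.abs(adj[i, j])
    nonzero = w > 0
    i, j, w = i[nonzero], j[nonzero], w[nonzero]
    p = 1.0 / (w + 1e-8)
    p = p / p.sum()
    n_rm = int(len(w) * remove_ratio)
    rm = rng.choice(len(w), size=n_rm, replace=False, p=p)
    out[i[rm], j[rm]] = 0.0
    out[j[rm], i[rm]] = 0.0  # keep the matrix symmetric
    return out
</code></pre>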
However, the transition of experimental protocols, originally crafted for human comprehension, into formats interpretable by machines presents significant challenges, which, within the context of a specific expert domain, encompass the necessity for structured as opposed to natural language, the imperative for explicit rather than tacit knowledge, and the preservation of causality and consistency throughout protocol steps. Presently, the task of protocol translation predominantly requires the manual and labor-intensive involvement of domain experts and information technology specialists, rendering the process time-intensive. To address these issues, we propose a framework that automates the protocol translation process through a three-stage workflow, which incrementally constructs Protocol Dependence Graphs (PDGs) that become structured at the syntax level, completed at the semantics level, and linked at the execution level. Quantitative and qualitative evaluations have demonstrated its performance on par with that of human experts, underscoring its potential to significantly expedite and democratize the process of scientific discovery by elevating the automation capabilities within self-driving laboratories. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00444v1-abstract-full').style.display = 'none'; document.getElementById('2411.00444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Advances in Neural Information Processing Systems (NeurIPS'24)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00387">arXiv:2411.00387</a> <span> [<a href="https://arxiv.org/pdf/2411.00387">pdf</a>, <a href="https://arxiv.org/format/2411.00387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STEM-POM: Evaluating Language Models Math-Symbol Reasoning in Document Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+J">Jiaru Zou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&query=Thakur%2C+P">Pratyush Thakur</a>, <a href="/search/cs?searchtype=author&query=Kani%2C+N">Nickvash Kani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00387v1-abstract-short" style="display: inline;"> Advances in large language models (LLMs) have spurred research into enhancing their reasoning capabilities, particularly in math-rich STEM documents. While LLMs can generate equations or solve math-related queries, their ability to fully understand and interpret abstract mathematical symbols in long, math-rich documents remains limited.
In this paper, we introduce STEM-PoM, a comprehensive benchma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00387v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00387v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00387v1-abstract-full" style="display: none;"> Advances in large language models (LLMs) have spurred research into enhancing their reasoning capabilities, particularly in math-rich STEM documents. While LLMs can generate equations or solve math-related queries, their ability to fully understand and interpret abstract mathematical symbols in long, math-rich documents remains limited. In this paper, we introduce STEM-PoM, a comprehensive benchmark dataset designed to evaluate LLMs' reasoning abilities on math symbols within contextual scientific text. The dataset, sourced from real-world ArXiv documents, contains over 2K math symbols classified into the main attributes of variables, constants, operators, and unit descriptors, with additional sub-attributes including scalar/vector/matrix for variables and local/global/discipline-specific labels for both constants and operators. Our extensive experiments show that state-of-the-art LLMs achieve an average of 20-60% accuracy under in-context learning and 50-60% accuracy with fine-tuning, revealing a significant gap in their mathematical reasoning capabilities. STEM-PoM fuels future research on developing advanced Math-AI models that can robustly handle math symbols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00387v1-abstract-full').style.display = 'none'; document.getElementById('2411.00387v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
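<p>The attribute taxonomy described above maps naturally onto a small record type. A minimal sketch, with field names that are our own illustration rather than the dataset's released schema:</p> <pre><code>from dataclasses import dataclass
from typing import Optional

MAIN_ATTRIBUTES = {"variable", "constant", "operator", "unit_descriptor"}

@dataclass
class MathSymbolRecord:
    # One STEM-PoM-style example: a symbol in context, its main attribute,
    # and an optional sub-attribute (e.g. scalar/vector/matrix for
    # variables; local/global/discipline-specific for constants/operators).
    symbol: str
    context: str
    main_attribute: str
    sub_attribute: Optional[str] = None

    def __post_init__(self):
        if self.main_attribute not in MAIN_ATTRIBUTES:
            raise ValueError(f"unknown main attribute: {self.main_attribute}")
</code></pre>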
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS Math-AI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00040">arXiv:2411.00040</a> <span> [<a href="https://arxiv.org/pdf/2411.00040">pdf</a>, <a href="https://arxiv.org/format/2411.00040">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> P$^2$C$^2$Net: PDE-Preserved Coarse Correction Network for efficient prediction of spatiotemporal dynamics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+P">Pu Ren</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin-Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhiwen Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chengze%2C+R">Ruizhi Chengze</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongsheng Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zidong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jian-Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Ji-Rong_Wen"> Ji-Rong_Wen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00040v1-abstract-short" style="display: inline;"> When solving partial differential equations (PDEs), classical numerical methods often require fine mesh grids and small time stepping to meet stability, consistency, and convergence conditions, leading to high computational cost. Recently, machine learning has been increasingly utilized to solve PDE problems, but they often encounter challenges related to interpretability, generalizability, and st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00040v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00040v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00040v1-abstract-full" style="display: none;"> When solving partial differential equations (PDEs), classical numerical methods often require fine mesh grids and small time stepping to meet stability, consistency, and convergence conditions, leading to high computational cost. Recently, machine learning has been increasingly utilized to solve PDE problems, but they often encounter challenges related to interpretability, generalizability, and strong dependency on rich labeled data. 
Hence, we introduce a new PDE-Preserved Coarse Correction Network (P$^2$C$^2$Net) to efficiently solve spatiotemporal PDE problems on coarse mesh grids in small data regimes. The model consists of two synergistic modules: (1) a trainable PDE block that learns to update the coarse solution (i.e., the system state), based on a high-order numerical scheme with boundary condition encoding, and (2) a neural network block that consistently corrects the solution on the fly. In particular, we propose a learnable symmetric Conv filter, with weights shared over the entire model, to accurately estimate the spatial derivatives of the PDE based on the neural-corrected system state. The resulting physics-encoded model is capable of handling limited training data (e.g., 3--5 trajectories) and accelerates the prediction of PDE solutions on coarse spatiotemporal grids while maintaining high accuracy. P$^2$C$^2$Net achieves consistent state-of-the-art performance with over 50\% gain (e.g., in terms of relative prediction error) across four datasets covering complex reaction-diffusion processes and turbulent flows. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00040v1-abstract-full').style.display = 'none'; document.getElementById('2411.00040v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23958">arXiv:2410.23958</a> <span> [<a href="https://arxiv.org/pdf/2410.23958">pdf</a>, <a href="https://arxiv.org/format/2410.23958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Space-bounded quantum interactive proof systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gall%2C+F+L">François Le Gall</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yupan Liu</a>, <a href="/search/cs?searchtype=author&query=Nishimura%2C+H">Harumichi Nishimura</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qisheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23958v1-abstract-short" style="display: inline;"> We introduce two models of space-bounded quantum interactive proof systems, ${\sf QIPL}$ and ${\sf QIP_{\rm U}L}$. The ${\sf QIP_{\rm U}L}$ model, a space-bounded variant of quantum interactive proofs (${\sf QIP}$) introduced by Watrous (CC 2003) and Kitaev and Watrous (STOC 2000), restricts verifier actions to unitary circuits.
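<p>The learnable symmetric Conv filter mentioned in the P$^2$C$^2$Net abstract above can be sketched as a small PyTorch module. The symmetrization shown, averaging a raw kernel with its flips, is one plausible construction that we assume for illustration; the paper's own parameterization may differ.</p> <pre><code>import torch
import torch.nn as nn

class SymmetricDerivativeConv(nn.Module):
    # A single shared stencil whose weights are constrained to be
    # symmetric, applied to the (neural-corrected) system state to
    # approximate spatial derivatives on a coarse grid.
    def __init__(self, kernel_size=5):
        super().__init__()
        self.raw = nn.Parameter(torch.randn(1, 1, kernel_size, kernel_size) * 0.01)

    def forward(self, u):
        # u: (batch, 1, H, W) system state on the coarse grid
        k = self.raw
        k = (k + torch.flip(k, dims=[2]) + torch.flip(k, dims=[3])
               + torch.flip(k, dims=[2, 3])) / 4.0  # enforce symmetry
        pad = k.shape[-1] // 2
        return nn.functional.conv2d(u, k, padding=pad)
</code></pre>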
In contrast, ${\sf QIPL}$ allows logarithmically many intermediate me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23958v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23958v1-abstract-full" style="display: none;"> We introduce two models of space-bounded quantum interactive proof systems, ${\sf QIPL}$ and ${\sf QIP_{\rm U}L}$. The ${\sf QIP_{\rm U}L}$ model, a space-bounded variant of quantum interactive proofs (${\sf QIP}$) introduced by Watrous (CC 2003) and Kitaev and Watrous (STOC 2000), restricts verifier actions to unitary circuits. In contrast, ${\sf QIPL}$ allows logarithmically many intermediate measurements per verifier action (with a high-concentration condition on yes instances), making it the weakest model that encompasses the classical model of Condon and Ladner (JCSS 1995). We characterize the computational power of ${\sf QIPL}$ and ${\sf QIP_{\rm U}L}$. When the message number $m$ is polynomially bounded, ${\sf QIP_{\rm U}L} \subsetneq {\sf QIPL}$ unless ${\sf P} = {\sf NP}$: - ${\sf QIPL}$ exactly characterizes ${\sf NP}$. - ${\sf QIP_{\rm U}L}$ is contained in ${\sf P}$ and contains ${\sf SAC}^1 \cup {\sf BQL}$, where ${\sf SAC}^1$ denotes problems solvable by classical logarithmic-depth, semi-unbounded fan-in circuits. However, this distinction vanishes when $m$ is constant. Our results further indicate that intermediate measurements uniquely impact space-bounded quantum interactive proofs, unlike in space-bounded quantum computation, where ${\sf BQL}={\sf BQ_{\rm U}L}$. We also introduce space-bounded unitary quantum statistical zero-knowledge (${\sf QSZK_{\rm U}L}$), a specific form of ${\sf QIP_{\rm U}L}$ proof systems with statistical zero-knowledge against any verifier. This class is a space-bounded variant of quantum statistical zero-knowledge (${\sf QSZK}$) defined by Watrous (SICOMP 2009). We prove that ${\sf QSZK_{\rm U}L} = {\sf BQL}$, implying that the statistical zero-knowledge property negates the computational advantage typically gained from the interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23958v1-abstract-full').style.display = 'none'; document.getElementById('2410.23958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">50 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23828">arXiv:2410.23828</a> <span> [<a href="https://arxiv.org/pdf/2410.23828">pdf</a>, <a href="https://arxiv.org/format/2410.23828">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Show Me What and Where has Changed? 
Question Answering and Grounding for Remote Sensing Change Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+F">Fuyu Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shaofeng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Quan Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xinbo Gao</a>, <a href="/search/cs?searchtype=author&query=Chua%2C+T">Tat-Seng Chua</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23828v2-abstract-short" style="display: inline;"> Remote sensing change detection aims to perceive changes occurring on the Earth's surface from remote sensing data in different periods, and feed these changes back to humans. However, most existing methods only focus on detecting change regions, lacking the capability to interact with users to identify changes that the users expect. In this paper, we introduce a new task named Change Detection Qu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23828v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23828v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23828v2-abstract-full" style="display: none;"> Remote sensing change detection aims to perceive changes occurring on the Earth's surface from remote sensing data in different periods, and feed these changes back to humans. However, most existing methods only focus on detecting change regions, lacking the capability to interact with users to identify changes that the users expect. In this paper, we introduce a new task named Change Detection Question Answering and Grounding (CDQAG), which extends the traditional change detection task by providing interpretable textual answers and intuitive visual evidence. To this end, we construct the first CDQAG benchmark dataset, termed QAG-360K, comprising over 360K triplets of questions, textual answers, and corresponding high-quality visual masks. It encompasses 10 essential land-cover categories and 8 comprehensive question types, which provides a valuable and diverse dataset for remote sensing applications. Furthermore, we present VisTA, a simple yet effective baseline method that unifies the tasks of question answering and grounding by delivering both visual and textual answers. Our method achieves state-of-the-art results on both the classic change detection-based visual question answering (CDVQA) and the proposed CDQAG datasets. Extensive qualitative and quantitative experimental results provide useful insights for developing better CDQAG models, and we hope that our work can inspire further research in this important yet underexplored research field. The proposed benchmark dataset and method are available at https://github.com/like413/VisTA. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23828v2-abstract-full').style.display = 'none'; document.getElementById('2410.23828v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23758">arXiv:2410.23758</a> <span> [<a href="https://arxiv.org/pdf/2410.23758">pdf</a>, <a href="https://arxiv.org/format/2410.23758">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reverse Attitude Statistics Based Star Map Identification Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+S">Shunmei Dong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qinglong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haiqing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianqian Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23758v1-abstract-short" style="display: inline;"> The star tracker is generally affected by the atmospheric background light and the aerodynamic environment when working in near space, which results in missing stars or false stars. Moreover, high-speed maneuvering may cause star trailing, which reduces the accuracy of the star position. To address the challenges for starmap identification, a reverse attitude statistics based method is proposed to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23758v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23758v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23758v1-abstract-full" style="display: none;"> The star tracker is generally affected by the atmospheric background light and the aerodynamic environment when working in near space, which results in missing stars or false stars. Moreover, high-speed maneuvering may cause star trailing, which reduces the accuracy of the star position. To address the challenges for starmap identification, a reverse attitude statistics based method is proposed to handle position noise, false stars, and missing stars. Conversely to existing methods which match before solving for attitude, this method introduces attitude solving into the matching process, and obtains the final match and the correct attitude simultaneously by frequency statistics. Firstly, based on stable angular distance features, the initial matching is obtained by utilizing spatial hash indexing. Then, the dual-vector attitude determination is introduced to calculate potential attitude. Finally, the star pairs are accurately matched by applying a frequency statistics filtering method. 
In addition, Bayesian optimization is employed to find optimal parameters under the impact of noise, which further enhances the algorithm's performance. In this work, the proposed method is validated in simulation, field tests, and on-orbit experiments. Compared with the state-of-the-art, the identification rate is improved by more than 14.3%, and the solving time is reduced by over 28.5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23758v1-abstract-full').style.display = 'none'; document.getElementById('2410.23758v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 17 figures, 4 tables, 4663 words; submitted to IEEE Sensors Journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23683">arXiv:2410.23683</a> <span> [<a href="https://arxiv.org/pdf/2410.23683">pdf</a>, <a href="https://arxiv.org/format/2410.23683">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unveiling User Satisfaction and Creator Productivity Trade-Offs in Recommendation Platforms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+F">Fan Yao</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yiming Liao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingzhou Liu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+S">Shaoliang Nie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qifan Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haifeng Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23683v2-abstract-short" style="display: inline;"> On User-Generated Content (UGC) platforms, recommendation algorithms significantly impact creators' motivation to produce content as they compete for algorithmically allocated user traffic. This phenomenon subtly shapes the volume and diversity of the content pool, which is crucial for the platform's sustainability. In this work, we demonstrate, both theoretically and empirically, that a purely re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23683v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23683v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23683v2-abstract-full" style="display: none;"> On User-Generated Content (UGC) platforms, recommendation algorithms significantly impact creators' motivation to produce content as they compete for algorithmically allocated user traffic.
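<p>The reverse attitude statistics idea of arXiv:2410.23758 above reduces to a voting loop. A minimal sketch, assuming a hash-indexed candidate list per observed star pair and a dual-vector attitude solver passed in as a function; all names are illustrative.</p> <pre><code>import numpy as np
from collections import Counter

def identify_by_attitude_statistics(obs_pairs, candidates, solve_attitude,
                                    bin_width=0.5):
    # For each candidate match of an observed pair (pre-filtered by
    # angular distance via spatial hashing), solve a dual-vector
    # attitude, quantize it, and vote; the most frequent attitude bin
    # selects the mutually consistent matches and the attitude itself.
    votes = Counter()
    matches = {}
    for obs, cands in zip(obs_pairs, candidates):
        for cand in cands:
            att = np.asarray(solve_attitude(obs, cand))  # e.g. Euler angles
            key = tuple(np.round(att / bin_width).astype(int))
            votes[key] += 1
            matches.setdefault(key, []).append((obs, cand))
    best_key, _ = votes.most_common(1)[0]
    return best_key, matches[best_key]
</code></pre>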
This phenomenon subtly shapes the volume and diversity of the content pool, which is crucial for the platform's sustainability. In this work, we demonstrate, both theoretically and empirically, that a purely relevance-driven policy with low exploration strength boosts short-term user satisfaction but undermines the long-term richness of the content pool. In contrast, a more aggressive exploration policy may slightly compromise user satisfaction but promote higher content creation volume. Our findings reveal a fundamental trade-off between immediate user satisfaction and overall content production on UGC platforms. Building on this finding, we propose an efficient optimization method to identify the optimal exploration strength, balancing user and creator engagement. Our model can serve as a pre-deployment audit tool for recommendation algorithms on UGC platforms, helping to align their immediate objectives with sustainable, long-term goals. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23683v2-abstract-full').style.display = 'none'; document.getElementById('2410.23683v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23039">arXiv:2410.23039</a> <span> [<a href="https://arxiv.org/pdf/2410.23039">pdf</a>, <a href="https://arxiv.org/format/2410.23039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Attention Field: Emerging Point Relevance in 3D Scenes for One-Shot Dexterous Grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianxu Wang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Congyue Deng</a>, <a href="/search/cs?searchtype=author&query=Lum%2C+T+G+W">Tyler Ga Wei Lum</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuanpei Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a>, <a href="/search/cs?searchtype=author&query=Bohg%2C+J">Jeannette Bohg</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yixin Zhu</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23039v1-abstract-short" style="display: inline;"> One-shot transfer of dexterous grasps to novel scenes with object and context variations has been a challenging problem. 
While distilled feature fields from large vision models have enabled semantic correspondences across 3D scenes, their features are point-based and restricted to object surfaces, limiting their capability of modeling complex semantic feature distributions for hand-object interact… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23039v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23039v1-abstract-full" style="display: none;"> One-shot transfer of dexterous grasps to novel scenes with object and context variations has been a challenging problem. While distilled feature fields from large vision models have enabled semantic correspondences across 3D scenes, their features are point-based and restricted to object surfaces, limiting their capability of modeling complex semantic feature distributions for hand-object interactions. In this work, we propose the \textit{neural attention field} for representing semantic-aware dense feature fields in the 3D space by modeling inter-point relevance instead of individual point features. Core to it is a transformer decoder that computes the cross-attention between any 3D query point and all the scene points, and provides the query point feature with an attention-based aggregation. We further propose a self-supervised framework for training the transformer decoder from only a few 3D point clouds without hand demonstrations. Post-training, the attention field can be applied to novel scenes for semantics-aware dexterous grasping from one-shot demonstration. Experiments show that our method provides better optimization landscapes by encouraging the end-effector to focus on task-relevant scene regions, resulting in significant improvements in success rates on real robots compared with the feature-field-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23039v1-abstract-full').style.display = 'none'; document.getElementById('2410.23039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
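<p>The cross-attention aggregation at the core of the neural attention field above can be written in a few lines. A minimal sketch, with the feature size and the single-head design being our assumptions:</p> <pre><code>import torch
import torch.nn as nn

class QueryPointAttention(nn.Module):
    # A 3D query point attends over all scene point features and
    # receives an attention-weighted aggregation as its own feature.
    def __init__(self, dim=256):
        super().__init__()
        self.to_q = nn.Linear(3, dim)   # embed the raw query coordinate
        self.to_k = nn.Linear(dim, dim)
        self.to_v = nn.Linear(dim, dim)
        self.scale = dim ** -0.5

    def forward(self, query_xyz, scene_feats):
        # query_xyz: (B, M, 3); scene_feats: (B, N, dim)
        q = self.to_q(query_xyz)
        k = self.to_k(scene_feats)
        v = self.to_v(scene_feats)
        attn = torch.softmax(q @ k.transpose(1, 2) * self.scale, dim=-1)
        return attn @ v                 # (B, M, dim) aggregated features
</code></pre>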
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22788">arXiv:2410.22788</a> <span> [<a href="https://arxiv.org/pdf/2410.22788">pdf</a>, <a href="https://arxiv.org/format/2410.22788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Theoretical Investigations and Practical Enhancements on Tail Task Risk Minimization in Meta Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yiqin Lv</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dong Liang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zheng Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22788v1-abstract-short" style="display: inline;"> Meta learning is a promising paradigm in the era of large models, and task distributional robustness has become an indispensable consideration in real-world scenarios. Recent advances have examined the effectiveness of tail task risk minimization in fast adaptation robustness improvement \citep{wang2023simple}. This work contributes further theoretical investigations and practical enhancements in t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22788v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22788v1-abstract-full" style="display: none;"> Meta learning is a promising paradigm in the era of large models, and task distributional robustness has become an indispensable consideration in real-world scenarios. Recent advances have examined the effectiveness of tail task risk minimization in fast adaptation robustness improvement \citep{wang2023simple}. This work contributes further theoretical investigations and practical enhancements in the field. Specifically, we reduce the distributionally robust strategy to a max-min optimization problem, establish the Stackelberg equilibrium as the solution concept, and estimate the convergence rate. In the presence of tail risk, we further derive the generalization bound, establish connections with estimated quantiles, and practically improve the studied strategy. Accordingly, extensive evaluations demonstrate the significance of our proposal and its scalability to multimodal large models in boosting robustness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22788v1-abstract-full').style.display = 'none'; document.getElementById('2410.22788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
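<p class="mathjax">One plausible way to write the max-min problem referenced in the abstract above, in our own notation rather than the paper's exact formulation: $$\min_{\theta}\,\max_{q \in \mathcal{Q}}\; \mathbb{E}_{\tau \sim q}\big[\mathcal{L}(\theta; \tau)\big],$$ where $\mathcal{Q}$ is an ambiguity set over task distributions and the tail is controlled through a quantile-based risk such as $\mathrm{CVaR}_\alpha$, the expected loss over the worst $\alpha$-fraction of tasks.</p>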
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22448">arXiv:2410.22448</a> <span> [<a href="https://arxiv.org/pdf/2410.22448">pdf</a>, <a href="https://arxiv.org/format/2410.22448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> A Closer Look at Neural Codec Resynthesis: Bridging the Gap between Codec and Waveform Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+A+H">Alexander H. Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qirui Wang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yuan Gong</a>, <a href="/search/cs?searchtype=author&query=Glass%2C+J">James Glass</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22448v1-abstract-short" style="display: inline;"> Neural Audio Codecs, initially designed as a compression technique, have gained more attention recently for speech generation. Codec models represent each audio frame as a sequence of tokens, i.e., discrete embeddings. The discrete and low-frequency nature of neural codecs introduced a new way to generate speech with token-based models. As these tokens encode information at various levels of granu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22448v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22448v1-abstract-full" style="display: none;"> Neural Audio Codecs, initially designed as a compression technique, have gained more attention recently for speech generation. Codec models represent each audio frame as a sequence of tokens, i.e., discrete embeddings. The discrete and low-frequency nature of neural codecs introduced a new way to generate speech with token-based models. As these tokens encode information at various levels of granularity, from coarse to fine, most existing works focus on how to better generate the coarse tokens. In this paper, we focus on an equally important but often overlooked question: How can we better resynthesize the waveform from coarse tokens? We point out that both the choice of learning target and resynthesis approach have a dramatic impact on the generated audio quality. Specifically, we study two different strategies based on token prediction and regression, and introduce a new method based on Schrödinger Bridge. We examine how different design choices affect machine and human perception.
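<p>The two resynthesis strategies compared in the abstract above, token prediction versus regression, differ only in the output head and loss. A minimal sketch, with shapes and module names assumed for illustration:</p> <pre><code>import torch
import torch.nn as nn

class FineTokenHead(nn.Module):
    # Token prediction: classify each frame's fine token over a codebook.
    def __init__(self, dim=512, codebook_size=1024):
        super().__init__()
        self.proj = nn.Linear(dim, codebook_size)

    def loss(self, coarse_feats, fine_tokens):
        logits = self.proj(coarse_feats)            # (B, T, codebook)
        return nn.functional.cross_entropy(
            logits.transpose(1, 2), fine_tokens)    # fine_tokens: (B, T)

class RegressionHead(nn.Module):
    # Regression: predict continuous fine embeddings directly.
    def __init__(self, dim=512):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def loss(self, coarse_feats, fine_embeddings):
        return nn.functional.mse_loss(self.proj(coarse_feats), fine_embeddings)
</code></pre>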
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22448v1-abstract-full').style.display = 'none'; document.getElementById('2410.22448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Audio Imagination workshop paper; demo page at https://alexander-h-liu.github.io/codec-resyn.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22339">arXiv:2410.22339</a> <span> [<a href="https://arxiv.org/pdf/2410.22339">pdf</a>, <a href="https://arxiv.org/format/2410.22339">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> DAWN: Designing Distributed Agents in a Worldwide Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Aminiranjbar%2C+Z">Zahra Aminiranjbar</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jianan Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiudan Wang</a>, <a href="/search/cs?searchtype=author&query=Pant%2C+S">Shubha Pant</a>, <a href="/search/cs?searchtype=author&query=Viswanathan%2C+M">Mahesh Viswanathan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22339v2-abstract-short" style="display: inline;"> The rapid evolution of Large Language Models (LLMs) has transformed them from basic conversational tools into sophisticated entities capable of complex reasoning and decision-making. These advancements have led to the development of specialized LLM-based agents designed for diverse tasks such as coding and web browsing. As these agents become more capable, the need for a robust framework that faci… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22339v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22339v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22339v2-abstract-full" style="display: none;"> The rapid evolution of Large Language Models (LLMs) has transformed them from basic conversational tools into sophisticated entities capable of complex reasoning and decision-making. These advancements have led to the development of specialized LLM-based agents designed for diverse tasks such as coding and web browsing. As these agents become more capable, the need for a robust framework that facilitates global communication and collaboration among them towards advanced objectives has become increasingly critical. 
Distributed Agents in a Worldwide Network (DAWN) addresses this need by offering a versatile framework that integrates LLM-based agents with traditional software systems, enabling the creation of agentic applications suited for a wide range of use cases. DAWN enables distributed agents worldwide to register and be easily discovered through Gateway Agents. Collaborations among these agents are coordinated by a Principal Agent equipped with reasoning strategies. DAWN offers three operational modes: No-LLM Mode for deterministic tasks, Copilot for augmented decision-making, and LLM Agent for autonomous operations. Additionally, DAWN ensures the safety and security of agent collaborations globally through a dedicated safety, security, and compliance layer, protecting the network against attackers and adhering to stringent security and compliance standards. These features make DAWN a robust network for deploying agent-based applications across various industries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22339v2-abstract-full').style.display = 'none'; document.getElementById('2410.22339v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22114">arXiv:2410.22114</a> <span> [<a href="https://arxiv.org/pdf/2410.22114">pdf</a>, <a href="https://arxiv.org/format/2410.22114">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Policy Gradient for Robust Markov Decision Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuhao Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shaohang Xu</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+C+P">Chin Pang Ho</a>, <a href="/search/cs?searchtype=author&query=Petrik%2C+M">Marek Petrik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22114v2-abstract-short" style="display: inline;"> We develop a generic policy gradient method with the global optimality guarantee for robust Markov Decision Processes (MDPs). While policy gradient methods are widely used for solving dynamic decision problems due to their scalable and efficient nature, adapting these methods to account for model ambiguity has been challenging, often making it impractical to learn robust policies. 
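<p>The register-and-discover flow through Gateway Agents described above can be pictured with a toy registry; since the framework's API is not shown here, every name below is hypothetical.</p> <pre><code>from dataclasses import dataclass, field

@dataclass
class GatewayAgent:
    # Toy sketch of DAWN-style registration and discovery: agents
    # register capabilities, and collaborators look them up by need.
    registry: dict = field(default_factory=dict)  # capability -> agent ids

    def register(self, agent_id, capabilities):
        for cap in capabilities:
            self.registry.setdefault(cap, []).append(agent_id)

    def discover(self, capability):
        return self.registry.get(capability, [])

gateway = GatewayAgent()
gateway.register("coder-01", ["coding"])
gateway.register("browser-07", ["web-browsing"])
assert gateway.discover("coding") == ["coder-01"]
</code></pre>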
This paper intro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22114v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22114v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22114v2-abstract-full" style="display: none;"> We develop a generic policy gradient method with the global optimality guarantee for robust Markov Decision Processes (MDPs). While policy gradient methods are widely used for solving dynamic decision problems due to their scalable and efficient nature, adapting these methods to account for model ambiguity has been challenging, often making it impractical to learn robust policies. This paper introduces a novel policy gradient method, Double-Loop Robust Policy Mirror Descent (DRPMD), for solving robust MDPs. DRPMD employs a general mirror descent update rule for policy optimization with adaptive tolerance per iteration, guaranteeing convergence to a globally optimal policy. We provide a comprehensive analysis of DRPMD, including new convergence results under both direct and softmax parameterizations, and offer novel insights into the inner problem solution through Transition Mirror Ascent (TMA). Additionally, we propose innovative parametric transition kernels for both discrete and continuous state-action spaces, broadening the applicability of our approach. Empirical results validate the robustness and global convergence of DRPMD across various challenging robust MDP settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22114v2-abstract-full').style.display = 'none'; document.getElementById('2410.22114v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
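<p class="mathjax">A mirror descent policy update of the kind DRPMD builds on can be written generically as $$\pi_{k+1} = \arg\max_{\pi \in \Pi} \left\{ \langle \nabla_\pi V_{\mathrm{rob}}^{\pi_k}, \pi \rangle - \frac{1}{\eta_k} D_\Phi(\pi, \pi_k) \right\},$$ where $D_\Phi$ is a Bregman divergence and $\eta_k$ a stepsize; this is the standard policy mirror descent form with a robust value $V_{\mathrm{rob}}$ in place of the nominal one (our rendering, not the paper's exact statement).</p>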
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21358">arXiv:2410.21358</a> <span> [<a href="https://arxiv.org/pdf/2410.21358">pdf</a>, <a href="https://arxiv.org/format/2410.21358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> "We do use it, but not how hearing people think": How the Deaf and Hard of Hearing Community Uses Large Language Model Tools </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huffman%2C+S">Shuxu Huffman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Si Chen</a>, <a href="/search/cs?searchtype=author&query=Mack%2C+K+A">Kelly Avery Mack</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Haotian Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Kushalnagar%2C+R">Raja Kushalnagar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21358v2-abstract-short" style="display: inline;"> Generative AI tools, particularly those utilizing large language models (LLMs), have become increasingly prevalent in both professional and personal contexts, offering powerful capabilities for text generation and communication support. While these tools are widely used to enhance productivity and accessibility, there has been limited exploration of how Deaf and Hard of Hearing (DHH) individuals e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21358v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21358v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21358v2-abstract-full" style="display: none;"> Generative AI tools, particularly those utilizing large language models (LLMs), have become increasingly prevalent in both professional and personal contexts, offering powerful capabilities for text generation and communication support. While these tools are widely used to enhance productivity and accessibility, there has been limited exploration of how Deaf and Hard of Hearing (DHH) individuals engage with text-based generative AI tools, as well as the challenges they may encounter. This paper presents a mixed-method survey study investigating how the DHH community uses Text AI tools, such as ChatGPT, to reduce communication barriers, bridge Deaf and hearing cultures, and improve access to information. Through a survey of 80 DHH participants and separate interviews with 11 other participants, we found that while these tools provide significant benefits, including enhanced communication and mental health support, they also introduce barriers, such as a lack of American Sign Language (ASL) support and understanding of Deaf cultural nuances. Our findings highlight unique usage patterns within the DHH community and underscore the need for inclusive design improvements. We conclude by offering practical recommendations to enhance the accessibility of Text AI for the DHH community and suggest directions for future research in AI and accessibility. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21358v2-abstract-full').style.display = 'none'; document.getElementById('2410.21358v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21201">arXiv:2410.21201</a> <span> [<a href="https://arxiv.org/pdf/2410.21201">pdf</a>, <a href="https://arxiv.org/ps/2410.21201">ps</a>, <a href="https://arxiv.org/format/2410.21201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Sample-Optimal Quantum Estimators for Pure-State Trace Distance and Fidelity via Samplizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qisheng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhicheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21201v1-abstract-short" style="display: inline;"> Trace distance and infidelity (induced by square root fidelity), as basic measures of the closeness of quantum states, are commonly used in quantum state discrimination, certification, and tomography. However, the sample complexity for their estimation still remains open. In this paper, we solve this problem for pure states. We present a quantum algorithm that estimates the trace distance and squa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21201v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21201v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21201v1-abstract-full" style="display: none;"> Trace distance and infidelity (induced by square root fidelity), as basic measures of the closeness of quantum states, are commonly used in quantum state discrimination, certification, and tomography. However, the sample complexity for their estimation still remains open. In this paper, we solve this problem for pure states. We present a quantum algorithm that estimates the trace distance and square root fidelity between pure states to within additive error $\varepsilon$, given sample access to their identical copies. Our algorithm achieves the optimal sample complexity $螛(1/\varepsilon^2)$, improving the long-standing folklore $O(1/\varepsilon^4)$. Our algorithm is composed of a samplized phase estimation of the product of two Householder reflections. 
Notably, an improved (multi-)samplizer for pure states is used as an algorithmic tool in our construction, through which any quantum query algorithm using $Q$ queries to the reflection operator about a pure state $|\psi\rangle$ can be converted to a $\delta$-close (in the diamond norm) quantum sample algorithm using $\Theta(Q^2/\delta)$ samples of $|\psi\rangle$. This samplizer for pure states is shown to be optimal.
Submitted 28 October, 2024; originally announced October 2024.
Comments: 24 pages, 3 figures, 1 table, 1 algorithm
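The $\Theta(1/\varepsilon^2)$ scaling has a simple classical analogue: estimating a Bernoulli mean (for instance, the acceptance probability of a SWAP test, which encodes the squared overlap of two pure states) to additive error $\varepsilon$ takes on the order of $1/\varepsilon^2$ samples by Hoeffding's inequality. The toy below illustrates only that scaling; it is not the paper's samplizer-based estimator.

```python
import numpy as np

rng = np.random.default_rng(0)

def estimate_overlap(p_accept, eps, delta=0.05):
    """Estimate a SWAP-test acceptance probability to additive error eps.

    A SWAP test on two pure states accepts with probability
    (1 + |<psi|phi>|^2) / 2, so estimating it to error eps recovers the
    squared overlap to error 2*eps. By Hoeffding's inequality,
    n = ln(2/delta) / (2 * eps^2) samples suffice: the 1/eps^2 scaling.
    """
    n = int(np.ceil(np.log(2 / delta) / (2 * eps**2)))
    samples = rng.random(n) < p_accept     # simulated accept/reject outcomes
    return samples.mean(), n

true_p = (1 + 0.8**2) / 2                  # overlap |<psi|phi>| = 0.8
est, n = estimate_overlap(true_p, eps=0.01)
print(f"used {n} samples, estimate {est:.4f}, truth {true_p:.4f}")
```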
arXiv:2410.20488 (https://arxiv.org/abs/2410.20488)
Categories: cs.CL
Title: FIRP: Faster LLM inference via future intermediate representation prediction
Authors: Pengfei Wu, Jiahao Liu, Zhuocheng Gong, Qifan Wang, Jinpeng Li, Jingang Wang, Xunliang Cai, Dongyan Zhao
Abstract: Recent advancements in Large Language Models (LLMs) have shown remarkable performance across a wide range of tasks. Despite this, the auto-regressive nature of LLM decoding, which generates only a single token per forward propagation, fails to fully exploit the parallel computational power of GPUs, leading to considerable latency. To address this, we introduce a novel speculative decoding method named FIRP, which generates multiple tokens instead of one at each decoding step. We achieve this by predicting the intermediate hidden states of future tokens (tokens that have not yet been decoded) and then using these pseudo hidden states to decode future tokens. Specifically, these pseudo hidden states are predicted with a simple linear transformation in the intermediate layers of the LLM. Once predicted, they participate in the computation of all the following layers, thereby assimilating richer semantic information. As the layers go deeper, the semantic gap between pseudo and real hidden states narrows, and it becomes feasible to decode future tokens with high accuracy. To validate the effectiveness of FIRP, we conduct extensive experiments, showing a speedup ratio of 1.9x-3x on several models and datasets; analytical experiments also corroborate our motivation.
Submitted 27 October, 2024; originally announced October 2024.
Journal ref: NLPCC2024
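FIRP's central move, predicting pseudo hidden states of future tokens with a simple linear transformation at an intermediate layer, can be sketched as follows. The module name FutureStatePredictor, the shapes, and the number of future positions are hypothetical; this is an illustration of the idea, not the paper's implementation.

```python
import torch
import torch.nn as nn

class FutureStatePredictor(nn.Module):
    """Predicts pseudo hidden states of the next k future tokens from the
    current token's hidden state at an intermediate layer (a sketch of the
    FIRP idea; the real method is trained inside a specific LLM)."""

    def __init__(self, d_model: int, k_future: int):
        super().__init__()
        # One linear transformation per future position, packed as one matrix.
        self.proj = nn.Linear(d_model, d_model * k_future)
        self.k = k_future
        self.d = d_model

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        # h: (batch, seq, d_model) hidden states at some intermediate layer.
        pseudo = self.proj(h)                        # (batch, seq, k * d)
        return pseudo.view(*h.shape[:2], self.k, self.d)

h = torch.randn(1, 5, 64)                    # toy hidden states
pseudo = FutureStatePredictor(64, k_future=3)(h)
print(pseudo.shape)                          # torch.Size([1, 5, 3, 64])
# In FIRP, these pseudo states would be fed through the remaining layers so
# that later layers can refine them before decoding the future tokens.
```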
arXiv:2410.20136 (https://arxiv.org/abs/2410.20136)
Categories: cs.CR, cs.LG
Title: CodePurify: Defend Backdoor Attacks on Neural Code Models via Entropy-based Purification
Authors: Fangwen Mu, Junjie Wang, Zhuohao Yu, Lin Shi, Song Wang, Mingyang Li, Qing Wang
Abstract: Neural code models have found widespread success in tasks pertaining to code intelligence, yet they are vulnerable to backdoor attacks, where an adversary can manipulate the victim model's behavior by inserting triggers into the source code. Recent studies indicate that advanced backdoor attacks can achieve nearly 100% attack success rates on many software engineering tasks. However, effective defense techniques against such attacks remain insufficiently explored. In this study, we propose CodePurify, a novel defense against backdoor attacks on code models through entropy-based purification, which precisely detects and eliminates possible triggers in the source code while preserving its semantic information. Within this process, CodePurify first develops a confidence-driven entropy-based measurement to determine whether a code snippet is poisoned and, if so, locates the triggers. Subsequently, it purifies the code by substituting the triggers with benign tokens using a masked language model. We extensively evaluate CodePurify against four advanced backdoor attacks across three representative tasks and two popular code models. The results show that CodePurify significantly outperforms four commonly used defense baselines, improving average defense performance by at least 40%, 40%, and 12% across the three tasks, respectively. These findings highlight the potential of CodePurify to serve as a robust defense against backdoor attacks on neural code models.
Submitted 26 October, 2024; originally announced October 2024.
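A minimal sketch of the entropy-based intuition, assuming we already have a masked language model's predictive distribution at each masked position: positions whose predictions are anomalously over-confident (low entropy) are flagged as trigger candidates. This is a crude stand-in for CodePurify's confidence-driven measurement, and the threshold is an arbitrary choice.

```python
import numpy as np

def token_entropies(prob_dists):
    """Shannon entropy of a model's predictive distribution at each masked
    position; prob_dists has shape (num_tokens, vocab_size)."""
    p = np.clip(prob_dists, 1e-12, 1.0)
    return -(p * np.log(p)).sum(axis=1)

def flag_trigger_candidates(prob_dists, z_thresh=1.5):
    """Flag tokens whose masked-prediction entropy is anomalously low
    relative to the rest of the snippet (a crude stand-in for CodePurify's
    confidence-driven measurement; the threshold is arbitrary)."""
    h = token_entropies(prob_dists)
    z = (h - h.mean()) / (h.std() + 1e-8)
    return np.where(z < -z_thresh)[0]       # suspiciously confident positions

# Toy usage: a fake 5-token snippet over a 4-word vocabulary.
dists = np.array([
    [0.25, 0.25, 0.25, 0.25],   # ordinary token: high entropy
    [0.97, 0.01, 0.01, 0.01],   # suspicious token: near-zero entropy
    [0.30, 0.30, 0.20, 0.20],
    [0.25, 0.25, 0.30, 0.20],
    [0.40, 0.20, 0.20, 0.20],
])
print(flag_trigger_candidates(dists))       # -> [1]
```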
arXiv:2410.20132 (https://arxiv.org/abs/2410.20132)
Categories: eess.SP, cs.AI, cs.LG, q-bio.BM
Title: On-Site Precise Screening of SARS-CoV-2 Systems Using a Channel-Wise Attention-Based PLS-1D-CNN Model with Limited Infrared Signatures
Authors: Wenwen Zhang, Zhouzhuo Tang, Yingmei Feng, Xia Yu, Qi Jie Wang, Zhiping Lin
Abstract: During the early stages of respiratory virus outbreaks, such as severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the efficient use of limited nasopharyngeal swabs for rapid and accurate screening is crucial for public health. In this study, we present a methodology that integrates attenuated total reflection-Fourier transform infrared spectroscopy (ATR-FTIR) with the adaptive iteratively reweighted penalized least squares (airPLS) preprocessing algorithm and a channel-wise attention-based partial least squares one-dimensional convolutional neural network (PLS-1D-CNN) model, enabling accurate screening of infected individuals within 10 minutes. Two cohorts of nasopharyngeal swab samples, comprising 126 and 112 samples from suspected SARS-CoV-2 Omicron variant cases, were collected at Beijing You'an Hospital for verification.
Given that ATR-FTIR spectra are highly sensitive to variations in experimental conditions, which can affect their quality, we propose a biomolecular importance (BMI) evaluation method to assess signal quality across different conditions, validated by comparing BMI with PLS-GBM and PLS-RF results. For the ATR-FTIR signals in cohort 2, which exhibited a higher BMI, airPLS was utilized for signal preprocessing, followed by the application of the channel-wise attention-based PLS-1D-CNN model for screening. The experimental results demonstrate that our model outperforms recently reported methods in the field of respiratory virus spectrum detection, achieving a recognition screening accuracy of 96.48%, a sensitivity of 96.24%, a specificity of 97.14%, an F1-score of 96.12%, and an AUC of 0.99. It meets the World Health Organization (WHO) recommended criteria for an acceptable product: sensitivity of 95.00% or greater and specificity of 97.00% or greater for testing prior SARS-CoV-2 infection in moderate to high volume scenarios.
Submitted 26 October, 2024; originally announced October 2024.
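The channel-wise attention component is, generically, a squeeze-and-excitation-style reweighting of the channels of a 1D feature map. The block below shows that standard mechanism under assumed dimensions; the paper's exact architecture and hyperparameters are not reproduced.

```python
import torch
import torch.nn as nn

class ChannelAttention1D(nn.Module):
    """Squeeze-and-excitation-style channel attention for 1D feature maps,
    the generic mechanism behind a channel-wise attention-based 1D CNN."""

    def __init__(self, channels: int, reduction: int = 4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, length) features from a 1D convolution.
        w = self.fc(x.mean(dim=2))           # squeeze over the length axis
        return x * w.unsqueeze(2)            # excite: reweight each channel

x = torch.randn(8, 16, 128)                  # e.g., 16 channels of IR features
print(ChannelAttention1D(16)(x).shape)       # torch.Size([8, 16, 128])
```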
arXiv:2410.19843 (https://arxiv.org/abs/2410.19843)
Categories: eess.SY, cs.LG
Title: Artificial intelligence for partial differential equations in computational mechanics: A review
Authors: Yizheng Wang, Jinshuai Bai, Zhongya Lin, Qimin Wang, Cosmin Anitescu, Jia Sun, Mohammad Sadegh Eshaghi, Yuantong Gu, Xi-Qiao Feng, Xiaoying Zhuang, Timon Rabczuk, Yinghua Liu
Abstract: In recent years, artificial intelligence (AI) has become ubiquitous, empowering various fields, especially the integration of artificial intelligence and traditional science (AI for Science: artificial intelligence for science), which has attracted widespread attention. In AI for Science, using artificial intelligence algorithms to solve partial differential equations (AI for PDEs: artificial intelligence for partial differential equations) has become a focal point in computational mechanics. The core of AI for PDEs is the fusion of data and partial differential equations (PDEs), which can solve almost any PDE. Therefore, this article provides a comprehensive review of the research on AI for PDEs, summarizing the existing algorithms and theories. The article discusses the applications of AI for PDEs in computational mechanics, including solid mechanics, fluid mechanics, and biomechanics. The existing AI for PDEs algorithms include those based on Physics-Informed Neural Networks (PINNs), Deep Energy Methods (DEM), Operator Learning, and Physics-Informed Neural Operators (PINO). AI for PDEs represents a new method of scientific simulation that provides approximate solutions to specific problems using large amounts of data, then fine-tunes according to specific physical laws, avoiding the need to compute from scratch like traditional algorithms. Thus, AI for PDEs is the prototype for future foundation models in computational mechanics, capable of significantly accelerating traditional numerical algorithms.
Submitted 21 October, 2024; originally announced October 2024.
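Among the families the review covers, PINNs are the easiest to sketch: a network u_theta(x) is trained so that the PDE residual, computed by automatic differentiation, vanishes together with the boundary conditions. Below is a minimal illustrative example for the toy ODE u'(x) = u(x) with u(0) = 1, whose exact solution is e^x; it stands in for the general idea, not for any specific method surveyed.

```python
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(1, 32), nn.Tanh(), nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for step in range(2000):
    x = torch.rand(64, 1, requires_grad=True)      # collocation points in [0, 1]
    u = net(x)
    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
    residual = du - u                              # enforce the PDE: u' = u
    bc = net(torch.zeros(1, 1)) - 1.0              # enforce the BC: u(0) = 1
    loss = residual.pow(2).mean() + bc.pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()

print(net(torch.tensor([[1.0]])).item())           # should approach e, roughly 2.718
```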
arXiv:2410.19744 (https://arxiv.org/abs/2410.19744)
Categories: cs.IR, cs.AI
Title: Towards Next-Generation LLM-based Recommender Systems: A Survey and Beyond
Authors: Qi Wang, Jindong Li, Shiqi Wang, Qianli Xing, Runliang Niu, He Kong, Rui Li, Guodong Long, Yi Chang, Chengqi Zhang
Abstract: Large language models (LLMs) have not only revolutionized the field of natural language processing (NLP) but also have the potential to bring a paradigm shift in many other fields due to their remarkable abilities of language understanding, as well as impressive generalization capabilities and reasoning skills. As a result, recent studies have actively attempted to harness the power of LLMs to improve recommender systems, and it is imperative to thoroughly review the recent advances and challenges of LLM-based recommender systems. Unlike existing work, this survey does not merely analyze the classification of LLM-based recommendation systems according to the technical framework of LLMs. Instead, it investigates how LLMs can better serve recommendation tasks from the perspective of the recommender systems community, thus enhancing the integration of large language models into recommender systems research and their practical application.
In addition, the long-standing gap between academic research and industrial applications related to recommender systems has not been well discussed, especially in the era of large language models. In this review, we introduce a novel taxonomy that originates from the intrinsic essence of recommendation, delving into the application of large language model-based recommendation systems and their industrial implementation. Specifically, we propose a three-tier structure that more accurately reflects the developmental progression of recommendation systems from research to practical implementation, including representing and understanding, scheming and utilizing, and industrial deployment. Furthermore, we discuss critical challenges and opportunities in this emerging field. An up-to-date version of the paper list is maintained at: https://github.com/jindongli-Ai/Next-Generation-LLM-based-Recommender-Systems-Survey.
Submitted 10 October, 2024; originally announced October 2024.
arXiv:2410.19400 (https://arxiv.org/abs/2410.19400)
Categories: cs.LG, cs.AI
Title: Offline Reinforcement Learning with OOD State Correction and OOD Action Suppression
Authors: Yixiu Mao, Qi Wang, Chen Chen, Yun Qu, Xiangyang Ji
Abstract: In offline reinforcement learning (RL), addressing the out-of-distribution (OOD) action issue has been a focus, but we argue that there exists an OOD state issue that also impairs performance yet has been underexplored. Such an issue describes the scenario in which the agent encounters states outside the offline dataset during the test phase, leading to uncontrolled behavior and performance degradation. To this end, we propose SCAS, a simple yet effective approach that unifies OOD state correction and OOD action suppression in offline RL. Technically, SCAS achieves value-aware OOD state correction, capable of correcting the agent from OOD states to high-value in-distribution states. Theoretical and empirical results show that SCAS also exhibits the effect of suppressing OOD actions. On standard offline RL benchmarks, SCAS achieves excellent performance without additional hyperparameter tuning. Moreover, benefiting from its OOD state correction feature, SCAS demonstrates enhanced robustness against environmental perturbations.
Submitted 1 November, 2024; v1 submitted 25 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
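For context on the action-suppression side: the most common mechanism in prior offline RL work is a behavior-cloning penalty added to the policy objective, as in TD3+BC. The sketch below shows that baseline mechanism only; it is explicitly not SCAS, whose value-aware OOD state correction is the paper's contribution.

```python
import torch

def policy_loss_with_bc_penalty(q_values, policy_actions, dataset_actions,
                                alpha: float = 2.5):
    """TD3+BC-style objective: maximize Q under the policy while penalizing
    deviation from dataset actions, which suppresses OOD actions.
    In practice q_values = critic(states, policy_actions), so the Q term
    carries gradient into the policy; here it is a detached toy tensor."""
    lam = alpha / q_values.abs().mean().detach()   # scale-invariant weight
    bc = (policy_actions - dataset_actions).pow(2).mean()
    return -(lam * q_values.mean()) + bc

q = torch.randn(256, 1)                            # toy critic values
pi_a = torch.randn(256, 6, requires_grad=True)     # policy actions
data_a = torch.randn(256, 6)                       # dataset actions
loss = policy_loss_with_bc_penalty(q, pi_a, data_a)
loss.backward()                                    # gradient flows via the BC term
print(loss.item())
```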
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18978">arXiv:2410.18978</a> <span> [<a href="https://arxiv.org/pdf/2410.18978">pdf</a>, <a href="https://arxiv.org/format/2410.18978">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Framer: Interactive Frame Interpolation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+K">Kecheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+H">Hao Ouyang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhekai Chen</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+B">Biao Gong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yujun Shen</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chunhua Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18978v2-abstract-short" style="display: inline;"> We propose Framer for interactive frame interpolation, which targets producing smoothly transitioning frames between two images as per user creativity. Concretely, besides taking the start and end frames as inputs, our approach supports customizing the transition process by tailoring the trajectory of some selected keypoints. Such a design enjoys two clear benefits. First, incorporating human inte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18978v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18978v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18978v2-abstract-full" style="display: none;"> We propose Framer for interactive frame interpolation, which targets producing smoothly transitioning frames between two images as per user creativity. Concretely, besides taking the start and end frames as inputs, our approach supports customizing the transition process by tailoring the trajectory of some selected keypoints. Such a design enjoys two clear benefits. First, incorporating human interaction mitigates the issue arising from numerous possibilities of transforming one image to another, and in turn enables finer control of local motions. Second, as the most basic form of interaction, keypoints help establish the correspondence across frames, enhancing the model to handle challenging cases (e.g., objects on the start and end frames are of different shapes and styles). It is noteworthy that our system also offers an "autopilot" mode, where we introduce a module to estimate the keypoints and refine the trajectory automatically, to simplify the usage in practice. 
Extensive experimental results demonstrate the appealing performance of Framer on various applications, such as image morphing, time-lapse video generation, cartoon interpolation, etc. The code, the model, and the interface will be released to facilitate further research.
Submitted 4 November, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: Project page: https://aim-uofa.github.io/Framer/
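The simplest trajectory a user (or the autopilot mode) could supply is a straight-line path between matched keypoints in the start and end frames. The helper below builds only such linear trajectories as a stand-in; the diffusion model that consumes them is the paper's contribution and is not reproduced here.

```python
import numpy as np

def linear_keypoint_trajectories(kps_start, kps_end, num_frames):
    """Linearly interpolate matched keypoints between two frames.

    kps_start, kps_end: (num_keypoints, 2) pixel coordinates.
    Returns (num_frames, num_keypoints, 2): one keypoint set per
    intermediate frame, a minimal stand-in for a user-drawn trajectory.
    """
    t = np.linspace(0.0, 1.0, num_frames)[:, None, None]
    return (1 - t) * kps_start[None] + t * kps_end[None]

start = np.array([[10.0, 20.0], [100.0, 50.0]])
end = np.array([[40.0, 25.0], [80.0, 90.0]])
print(linear_keypoint_trajectories(start, end, num_frames=5).shape)  # (5, 2, 2)
```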
arXiv:2410.18935 (https://arxiv.org/abs/2410.18935)
Categories: cs.AI, cs.CL
Title: Schema-Guided Culture-Aware Complex Event Simulation with Multi-Agent Role-Play
Authors: Sha Li, Revanth Gangi Reddy, Khanh Duy Nguyen, Qingyun Wang, May Fung, Chi Han, Jiawei Han, Kartik Natarajan, Clare R. Voss, Heng Ji
Abstract: Complex news events, such as natural disasters and socio-political conflicts, require swift responses from the government and society. Relying on historical events to project the future is insufficient, as such events are sparse and do not cover all possible conditions and nuanced situations. Simulation of these complex events can help better prepare for them and reduce their negative impact. We develop a controllable complex news event simulator guided by both an event schema representing domain knowledge about the scenario and user-provided assumptions representing case-specific conditions. As event dynamics depend on the fine-grained social and cultural context, we further introduce a geo-diverse commonsense and cultural norm-aware knowledge enhancement component. To enhance the coherence of the simulation, apart from the global timeline of events, we take an agent-based approach to simulate the individual character states, plans, and actions. By incorporating the schema and cultural norms, our generated simulations achieve much higher coherence and appropriateness and are received favorably by participants from a humanitarian assistance organization.
Submitted 24 October, 2024; originally announced October 2024.
Comments: Accepted as EMNLP 2024 Demo
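The agent-based half of the simulator, where each character carries persistent state and a plan and acts at every tick of the global timeline, follows a standard simulation-loop pattern. The skeleton below is a generic illustration; the Agent class and its act logic are hypothetical, and the schema- and culture-aware components the paper contributes are not modeled.

```python
from dataclasses import dataclass, field

@dataclass
class Agent:
    """A simulated character with persistent state and a plan queue
    (hypothetical names; an LLM role-play call would replace `act`)."""
    name: str
    state: dict = field(default_factory=dict)
    plan: list = field(default_factory=list)

    def act(self, timeline_event: str) -> str:
        step = self.plan.pop(0) if self.plan else "observe"
        self.state["last_action"] = step
        return f"{self.name} reacts to '{timeline_event}' by: {step}"

agents = [Agent("mayor", plan=["issue evacuation order"]),
          Agent("resident", plan=["pack belongings", "evacuate"])]

for event in ["storm warning issued", "flooding begins"]:  # global timeline
    for agent in agents:
        print(agent.act(event))
```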
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as EMNLP 2024 Demo</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18756">arXiv:2410.18756</a> <span> [<a href="https://arxiv.org/pdf/2410.18756">pdf</a>, <a href="https://arxiv.org/format/2410.18756">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Schedule Your Edit: A Simple yet Effective Diffusion Noise Schedule for Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haonan Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengmeng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=An%2C+W">Wenbin An</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yan Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+G">Guang Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qianying Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18756v3-abstract-short" style="display: inline;"> Text-guided diffusion models have significantly advanced image editing, enabling high-quality and diverse modifications driven by text prompts. However, effective editing requires inverting the source image into a latent space, a process often hindered by prediction errors inherent in DDIM inversion. These errors accumulate during the diffusion process, resulting in inferior content preservation a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18756v3-abstract-full').style.display = 'inline'; document.getElementById('2410.18756v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18756v3-abstract-full" style="display: none;"> Text-guided diffusion models have significantly advanced image editing, enabling high-quality and diverse modifications driven by text prompts. However, effective editing requires inverting the source image into a latent space, a process often hindered by prediction errors inherent in DDIM inversion. These errors accumulate during the diffusion process, resulting in inferior content preservation and edit fidelity, especially with conditional inputs. We address these challenges by investigating the primary contributors to error accumulation in DDIM inversion and identify the singularity problem in traditional noise schedules as a key issue. To resolve this, we introduce the Logistic Schedule, a novel noise schedule designed to eliminate singularities, improve inversion stability, and provide a better noise space for image editing. This schedule reduces noise prediction errors, enabling more faithful editing that preserves the original content of the source image. 
Our approach requires no additional retraining and is compatible with various existing editing methods. Experiments across eight editing tasks demonstrate the Logistic Schedule's superior performance in content preservation and edit fidelity compared to traditional noise schedules, highlighting its adaptability and effectiveness.
Submitted 28 October, 2024; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: Accepted in NeurIPS 2024
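To see what a singularity-free schedule can look like: linear and cosine schedules drive the cumulative signal-retention coefficient alpha_bar(t) to its endpoints, whereas a logistic-shaped curve keeps it strictly inside (0, 1) with bounded slope. The curve below is an illustrative guess at such a shape, assuming a generic sigmoid parameterization; it is not the paper's exact Logistic Schedule.

```python
import numpy as np

def logistic_alpha_bar(num_steps, k=10.0, t0=0.5):
    """A logistic-shaped cumulative signal-retention schedule alpha_bar(t).

    alpha_bar decreases smoothly from near 1 to near 0 along a sigmoid in t,
    with bounded slope and no singular endpoints. The gain k and midpoint t0
    are illustrative choices, not the paper's values.
    """
    t = np.linspace(0.0, 1.0, num_steps)
    return 1.0 / (1.0 + np.exp(k * (t - t0)))

ab = logistic_alpha_bar(1000)
print(ab[0], ab[-1])   # ~0.993 at t=0, ~0.007 at t=1: never exactly 0 or 1
```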
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>