Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 401 results for author: <span class="mathjax">Fan, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Fan%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Fan, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Fan%2C+X&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Fan, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Fan%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14378">arXiv:2411.14378</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14378">pdf</a>, <a href="https://arxiv.org/format/2411.14378">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CoNFiLD-inlet: Synthetic Turbulence Inflow Using Generative Latent Diffusion Models with Neural Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xin-Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Parikh%2C+M+H">Meet Hemant Parikh</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiantao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+P">Pan Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yi-Fan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jian-Xun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14378v1-abstract-short" style="display: inline;"> Eddy-resolving turbulence simulations require stochastic inflow conditions that accurately replicate the complex, multi-scale structures of turbulence. 
Traditional recycling-based methods rely on computationally expensive precursor simulations, while existing synthetic inflow generators often fail to reproduce realistic coherent structures of turbulence. Recent advances in deep learning (DL) have&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14378v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14378v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14378v1-abstract-full" style="display: none;"> Eddy-resolving turbulence simulations require stochastic inflow conditions that accurately replicate the complex, multi-scale structures of turbulence. Traditional recycling-based methods rely on computationally expensive precursor simulations, while existing synthetic inflow generators often fail to reproduce realistic coherent structures of turbulence. Recent advances in deep learning (DL) have opened new possibilities for inflow turbulence generation, yet many DL-based methods rely on deterministic, autoregressive frameworks prone to error accumulation, resulting in poor robustness for long-term predictions. In this work, we present CoNFiLD-inlet, a novel DL-based inflow turbulence generator that integrates diffusion models with a conditional neural field (CNF)-encoded latent space to produce realistic, stochastic inflow turbulence. By parameterizing inflow conditions using Reynolds numbers, CoNFiLD-inlet generalizes effectively across a wide range of Reynolds numbers ($Re_蟿$ between $10^3$ and $10^4$) without requiring retraining or parameter tuning. Comprehensive validation through a priori and a posteriori tests in Direct Numerical Simulation (DNS) and Wall-Modeled Large Eddy Simulation (WMLES) demonstrates its high fidelity, robustness, and scalability, positioning it as an efficient and versatile solution for inflow turbulence synthesis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14378v1-abstract-full').style.display = 'none'; document.getElementById('2411.14378v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
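
   The latent-diffusion sampling loop this abstract alludes to can be sketched generically. Below is a minimal DDPM-style reverse-sampling sketch over a latent vector, with a random-weight stand-in for the trained denoiser; eps_model, the step count, and the dimensions are illustrative assumptions, not the authors' code (a real CoNFiLD-style model would also condition on $Re_\tau$):

      import numpy as np

      rng = np.random.default_rng(0)
      T, d = 50, 8                        # diffusion steps, latent dimension
      betas = np.linspace(1e-4, 0.05, T)  # noise schedule
      alphas = 1.0 - betas
      abar = np.cumprod(alphas)

      W = rng.normal(size=(d, d)) * 0.1   # stand-in weights for a trained denoiser

      def eps_model(z, t):
          # Hypothetical noise predictor; a trained network goes here.
          return np.tanh(z @ W)

      z = rng.normal(size=d)              # start from pure Gaussian noise
      for t in reversed(range(T)):
          eps = eps_model(z, t)
          # Standard DDPM reverse-mean update.
          z = (z - betas[t] / np.sqrt(1.0 - abar[t]) * eps) / np.sqrt(alphas[t])
          if t > 0:
              z += np.sqrt(betas[t]) * rng.normal(size=d)
      print("sampled latent:", np.round(z, 3))

   In the paper's setting the sampled latent would then be decoded by the conditional neural field into an inflow velocity field; here it is simply printed.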
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09947">arXiv:2411.09947</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.09947">pdf</a>, <a href="https://arxiv.org/format/2411.09947">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LoRA-LiteE: A Computationally Efficient Framework for Chatbot Preference-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yahe Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+C">Chunliang Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaojing Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09947v1-abstract-short" style="display: inline;"> Effective preference tuning is pivotal in aligning chatbot responses with human expectations, enhancing user satisfaction and engagement. Traditional approaches, notably Reinforcement Learning from Human Feedback (RLHF) as employed in advanced models like GPT-4, have demonstrated considerable success in this domain. However, RLHF methods are often computationally intensive and resource-demanding,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09947v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09947v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09947v1-abstract-full" style="display: none;"> Effective preference tuning is pivotal in aligning chatbot responses with human expectations, enhancing user satisfaction and engagement. Traditional approaches, notably Reinforcement Learning from Human Feedback (RLHF) as employed in advanced models like GPT-4, have demonstrated considerable success in this domain. However, RLHF methods are often computationally intensive and resource-demanding, limiting their scalability and accessibility for broader applications. To address these challenges, this study introduces LoRA-Lite Ensemble (LoRA-LiteE), an innovative framework that combines Supervised Fine-tuning (SFT) with Low-Rank Adaptation (LoRA) and Ensemble Learning techniques to effectively aggregate predictions of lightweight models, which aim to achieve a balance between the performance and computational cost. Utilizing the Chatbot Arena benchmark dataset, we conduct a comprehensive comparative analysis among our LoRA-LiteE model, corresponding base models at different scales, and GPT-4 trained with RLHF. Our empirical results demonstrate that the proposed LoRA-LiteE model achieves comparable performance to un-finetuned GPT-4 and outperforms the single larger-scale models under limited resource constraints. 
These findings highlight that our LoRA-LiteE provides a feasible and efficient methodology for human preference prediction in chatbot systems, enhancing scalability and accessibility, and thereby broadening the applicability of preference-tuned chatbots in resource-constrained environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09947v1-abstract-full').style.display = 'none'; document.getElementById('2411.09947v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08651">arXiv:2411.08651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08651">pdf</a>, <a href="https://arxiv.org/format/2411.08651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Estimating unknown parameters in differential equations with a reinforcement learning based PSO method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+W">Wenkui Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaoya Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+L">Lijuan Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+T">Tinyi Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Yau%2C+S">Shing-Tung Yau</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+R">Rongling Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08651v1-abstract-short" style="display: inline;"> Differential equations offer a foundational yet powerful framework for modeling interactions within complex dynamic systems and are widely applied across numerous scientific fields. One common challenge in this area is estimating the unknown parameters of these dynamic relationships. However, traditional numerical optimization methods rely on the selection of initial parameter values, making them&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08651v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08651v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08651v1-abstract-full" style="display: none;"> Differential equations offer a foundational yet powerful framework for modeling interactions within complex dynamic systems and are widely applied across numerous scientific fields. One common challenge in this area is estimating the unknown parameters of these dynamic relationships. However, traditional numerical optimization methods rely on the selection of initial parameter values, making them prone to local optima. 
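
   The ensembling step is simple to picture: each lightweight fine-tuned model scores a response pair, and the framework aggregates those scores. A minimal sketch with stand-in models; the sigmoid scorers and the feature vector are hypothetical, not the paper's API, and averaging predicted probabilities is just one common aggregation choice:

      import numpy as np

      rng = np.random.default_rng(1)

      # Stand-ins for three lightweight LoRA-finetuned preference models.
      # Each maps features of a (response A, response B) pair to P(A preferred).
      def make_model(seed):
          w = np.random.default_rng(seed).normal(size=16)
          return lambda x: 1.0 / (1.0 + np.exp(-(x @ w)))

      models = [make_model(s) for s in (1, 2, 3)]

      x = rng.normal(size=16)              # features for one comparison
      p = np.mean([m(x) for m in models])  # ensemble: average the probabilities
      print(f"P(response A preferred) = {p:.3f}")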

3. arXiv:2411.08651 [pdf, other] cs.LG cs.AI
   Estimating unknown parameters in differential equations with a reinforcement learning based PSO method
   Authors: Wenkui Sun, Xiaoya Fan, Lijuan Jia, Tinyi Chu, Shing-Tung Yau, Rongling Wu, Zhong Wang
   Abstract: Differential equations offer a foundational yet powerful framework for modeling interactions within complex dynamic systems and are widely applied across numerous scientific fields. One common challenge in this area is estimating the unknown parameters of these dynamic relationships. However, traditional numerical optimization methods rely on the selection of initial parameter values, making them prone to local optima. Meanwhile, deep learning and Bayesian methods require training models on specific differential equations, resulting in poor versatility. This paper reformulates the parameter estimation problem of differential equations as an optimization problem by introducing the concept of particles from the particle swarm optimization algorithm. Building on reinforcement learning-based particle swarm optimization (RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown parameters of differential equations. We compared its performance with state-of-the-art methods, including the RLLPSO algorithm, traditional numerical methods, deep learning approaches, and Bayesian methods, on three typical ordinary differential equations. The experimental results demonstrate that DERLPSO consistently outperforms the other methods, achieving an average mean square error of 1.13e-05, approximately four orders of magnitude lower than the alternatives. Beyond ordinary differential equations, DERLPSO also shows great promise for estimating unknown parameters of partial differential equations. The DERLPSO method has high accuracy, is independent of initial parameter values, and possesses strong versatility and stability. This work provides new insights into unknown parameter estimation for differential equations.
   Submitted 13 November, 2024; originally announced November 2024.
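
   The optimization problem at the core of this abstract is easy to state: choose ODE parameters that minimize the discrepancy between simulated and observed trajectories. Below is a plain PSO sketch for a one-parameter decay ODE, dy/dt = -k*y, using the analytic solution for brevity (a numerical integrator such as scipy.integrate.solve_ivp would replace it in general); this is vanilla PSO under assumed hyperparameters, not the reinforcement-learning-enhanced DERLPSO:

      import numpy as np

      rng = np.random.default_rng(0)

      # "Observed" data from dy/dt = -k*y with true k = 0.7 and y(0) = 2.0.
      t = np.linspace(0.0, 4.0, 25)
      y_obs = 2.0 * np.exp(-0.7 * t)

      def loss(k):
          return np.mean((2.0 * np.exp(-k * t) - y_obs) ** 2)

      # Plain PSO over the scalar parameter k in [0, 2].
      n, w, c1, c2 = 20, 0.7, 1.5, 1.5
      pos = rng.uniform(0.0, 2.0, n)
      vel = np.zeros(n)
      pbest, pcost = pos.copy(), np.array([loss(k) for k in pos])
      gbest = pbest[np.argmin(pcost)]

      for _ in range(60):
          r1, r2 = rng.random(n), rng.random(n)
          vel = w * vel + c1 * r1 * (pbest - pos) + c2 * r2 * (gbest - pos)
          pos = np.clip(pos + vel, 0.0, 2.0)
          cost = np.array([loss(k) for k in pos])
          better = cost < pcost
          pbest[better], pcost[better] = pos[better], cost[better]
          gbest = pbest[np.argmin(pcost)]

      print(f"estimated k = {gbest:.4f} (true value 0.7), MSE = {loss(gbest):.2e}")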

4. arXiv:2411.07042 [pdf, other] cs.HC cs.AI cs.CL cs.CY
   Minion: A Technology Probe for Resolving Value Conflicts through Expert-Driven and User-Driven Strategies in AI Companion Applications
   Authors: Xianzhe Fan, Qing Xiao, Xuhui Zhou, Yuran Su, Zhicong Lu, Maarten Sap, Hong Shen
   Abstract: AI companions based on large language models can role-play and converse very naturally. When value conflicts arise between the AI companion and the user, it may offend or upset the user. Yet, little research has examined such conflicts. We first conducted a formative study that analyzed 151 user complaints about conflicts with AI companions, providing design implications for our study. Based on these, we created Minion, a technology probe to help users resolve human-AI value conflicts. Minion applies a user-empowerment intervention method that provides suggestions by combining expert-driven and user-driven conflict resolution strategies. We conducted a technology probe study, creating 40 value conflict scenarios on Character.AI and Talkie. 22 participants completed 274 tasks and successfully resolved conflicts 94.16% of the time. We summarize user responses, preferences, and needs in resolving value conflicts, and propose design implications to reduce conflicts and empower users to resolve them more effectively.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: 18 pages, 5 figures

5. arXiv:2410.21813 [pdf, other] cs.CV
   SAM-Swin: SAM-Driven Dual-Swin Transformers with Adaptive Lesion Enhancement for Laryngo-Pharyngeal Tumor Detection
   Authors: Jia Wei, Yun Li, Xiaomao Fan, Wenjun Ma, Meiyu Qiu, Hongyu Chen, Wenbin Lei
   Abstract: Laryngo-pharyngeal cancer (LPC) is a highly lethal malignancy in the head and neck region. Recent advancements in tumor detection, particularly through dual-branch network architectures, have significantly improved diagnostic accuracy by integrating global and local feature extraction. However, challenges remain in accurately localizing lesions and fully capitalizing on the complementary nature of features within these branches. To address these issues, we propose SAM-Swin, an innovative SAM-driven Dual-Swin Transformer for laryngo-pharyngeal tumor detection. This model leverages the robust segmentation capabilities of the Segment Anything Model 2 (SAM2) to achieve precise lesion segmentation. Meanwhile, we present a multi-scale lesion-aware enhancement module (MS-LAEM) designed to adaptively enhance the learning of nuanced complementary features across various scales, improving the quality of feature extraction and representation. Furthermore, we implement a multi-scale class-aware guidance (CAG) loss that delivers multi-scale targeted supervision, thereby enhancing the model's capacity to extract class-specific features. To validate our approach, we compiled three LPC datasets from the First Affiliated Hospital (FAHSYSU) and the Sixth Affiliated Hospital (SAHSYSU) of Sun Yat-sen University, and Nanfang Hospital of Southern Medical University (NHSMU). The FAHSYSU dataset is used for internal training, while the SAHSYSU and NHSMU datasets serve for external evaluation. Extensive experiments demonstrate that SAM-Swin outperforms state-of-the-art methods, showcasing its potential for advancing LPC detection and improving patient outcomes. The source code of SAM-Swin is available at https://github.com/VVJia/SAM-Swin.
   Submitted 29 October, 2024; originally announced October 2024.

6. arXiv:2410.19512 [pdf, other] cs.LG
   Marked Temporal Bayesian Flow Point Processes
   Authors: Hui Chen, Xuhui Fan, Hengyu Liu, Longbing Cao
   Abstract: Marked event data captures events by recording their continuous-valued occurrence timestamps along with their corresponding discrete-valued types. It appears in various real-world scenarios such as social media, financial transactions, and healthcare records, and has been effectively modeled through Marked Temporal Point Process (MTPP) models. Recently, generative models for MTPPs have developed rapidly, owing to their powerful generative capability and less restrictive functional forms. However, existing generative MTPP models usually struggle to jointly model events' timestamps and types, since: (1) mainstream methods design the generative mechanisms for timestamps only and do not include event types; (2) the complex interdependence between timestamps and event types is overlooked. In this paper, we propose a novel generative MTPP model called BMTPP. Unlike existing generative MTPP models, BMTPP flexibly models marked temporal joint distributions using a parameter-based approach. Additionally, by adding joint noise to the marked temporal data space, BMTPP effectively captures and explicitly reveals the interdependence between timestamps and event types. Extensive experiments validate the superiority of our approach over other state-of-the-art models and its ability to effectively capture marked-temporal interdependence.
   Submitted 25 October, 2024; originally announced October 2024.
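
   To make "marked event data" concrete, the sketch below simulates a toy marked temporal point process: exponential inter-arrival times whose rate depends on the current event type, plus a Markov transition over types. All numbers are illustrative, and this only generates such data; it does not implement the BMTPP model itself:

      import numpy as np

      rng = np.random.default_rng(0)
      n_types = 3
      # P(next type | current type): the discrete-valued "marks".
      P = np.array([[0.6, 0.3, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.3, 0.2, 0.5]])
      rates = np.array([1.0, 2.0, 0.5])   # event rate while in each type

      t, k, events = 0.0, 0, []
      for _ in range(10):
          t += rng.exponential(1.0 / rates[k])  # continuous-valued timestamp
          k = rng.choice(n_types, p=P[k])       # discrete-valued event type
          events.append((round(t, 3), k))
      print(events)                             # [(timestamp, type), ...]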

7. arXiv:2410.19153 [pdf, other] cs.LG
   Learning Coupled Subspaces for Multi-Condition Spike Data
   Authors: Yididiya Y. Nadew, Xuhui Fan, Christopher J. Quinn
   Abstract: In neuroscience, researchers typically conduct experiments under multiple conditions to acquire neural responses in the form of high-dimensional spike train datasets. Analysing high-dimensional spike data is a challenging statistical problem. To this end, Gaussian process factor analysis (GPFA), a popular class of latent variable models, has been proposed. GPFA extracts smooth, low-dimensional latent trajectories underlying high-dimensional spike train datasets. However, such analyses are often done separately for each experimental condition, contrary to the nature of neural datasets, which contain recordings under multiple experimental conditions. Exploiting the parametric nature of these conditions, we propose a multi-condition GPFA model and inference procedure to learn the underlying latent structure in the corresponding datasets in a sample-efficient manner. In particular, we propose a non-parametric Bayesian approach to learn a smooth tuning function over the experiment condition space. Our approach is not only more accurate and faster, but also more interpretable than approaches that separately fit models for each experimental condition.
   Submitted 24 October, 2024; originally announced October 2024.
   Comments: 27 pages, 3 figures
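
   The dimensionality reduction GPFA performs can be approximated with plain factor analysis: simulate spike counts driven by a smooth low-dimensional latent trajectory, then recover a latent subspace. A rough sketch using scikit-learn; GPFA additionally places Gaussian-process smoothness priors over time and models spike counts more faithfully:

      import numpy as np
      from sklearn.decomposition import FactorAnalysis

      rng = np.random.default_rng(0)
      T, n_neurons, n_latent = 200, 30, 2

      # Smooth 2-D latent trajectory driving 30 neurons.
      time = np.linspace(0, 4 * np.pi, T)
      Z = np.c_[np.sin(time), np.cos(time / 2)]
      C = rng.normal(size=(n_latent, n_neurons))
      rates = np.exp(0.5 * Z @ C)          # log-linear firing rates
      spikes = rng.poisson(rates)          # (T, n_neurons) spike-count matrix

      fa = FactorAnalysis(n_components=n_latent).fit(spikes)
      Z_hat = fa.transform(spikes)         # recovered latent trajectory
      print(Z_hat.shape)                   # (200, 2)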
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18798">arXiv:2410.18798</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18798">pdf</a>, <a href="https://arxiv.org/format/2410.18798">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Distill Visual Chart Reasoning Ability from LLMs to MLLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+W">Wei He</a>, <a href="/search/cs?searchtype=author&amp;query=Xi%2C+Z">Zhiheng Xi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wanxu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaoran Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yiwen Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Z">Zifei Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Gui%2C+T">Tao Gui</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18798v1-abstract-short" style="display: inline;"> Solving complex chart Q&amp;A tasks requires advanced visual reasoning abilities in multimodal large language models (MLLMs). Recent studies highlight that these abilities consist of two main parts: recognizing key information from visual inputs and conducting reasoning over it. Thus, a promising approach to enhance MLLMs is to construct relevant training data focusing on the two aspects. However, col&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18798v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18798v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18798v1-abstract-full" style="display: none;"> Solving complex chart Q&amp;A tasks requires advanced visual reasoning abilities in multimodal large language models (MLLMs). Recent studies highlight that these abilities consist of two main parts: recognizing key information from visual inputs and conducting reasoning over it. Thus, a promising approach to enhance MLLMs is to construct relevant training data focusing on the two aspects. However, collecting and annotating complex charts and questions is costly and time-consuming, and ensuring the quality of annotated answers remains a challenge. In this paper, we propose Code-as-Intermediary Translation (CIT), a cost-effective, efficient and easily scalable data synthesis method for distilling visual reasoning abilities from LLMs to MLLMs. The code serves as an intermediary that translates visual chart representations into textual representations, enabling LLMs to understand cross-modal information. 
Specifically, we employ text-based synthesizing techniques to construct chart-plotting code and produce ReachQA, a dataset containing 3k reasoning-intensive charts and 20k Q&amp;A pairs to enhance both recognition and reasoning abilities. Experiments show that when fine-tuned with our data, models not only perform well on chart-related benchmarks, but also demonstrate improved multimodal reasoning abilities on general mathematical benchmarks like MathVista. The code and dataset are publicly available at https://github.com/hewei2001/ReachQA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18798v1-abstract-full').style.display = 'none'; document.getElementById('2410.18798v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review. The code and dataset are publicly available at https://github.com/hewei2001/ReachQA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13905">arXiv:2410.13905</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.13905">pdf</a>, <a href="https://arxiv.org/format/2410.13905">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> P4GCN: Vertical Federated Social Recommendation with Privacy-Preserving Two-Party Graph Convolution Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wanwan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Z">Zhaopeng Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Ziqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Cheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaoliang Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13905v1-abstract-short" style="display: inline;"> In recent years, graph neural networks (GNNs) have been commonly utilized for social recommendation systems. However, real-world scenarios often present challenges related to user privacy and business constraints, inhibiting direct access to valuable social information from other platforms. 
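
   The code-as-intermediary idea is straightforward to illustrate: the plotting code below is a complete textual representation of a chart, so a text-only LLM can read, transform, or generate Q&A about the chart without ever seeing pixels. A hypothetical example in matplotlib, not drawn from the ReachQA pipeline; all data values are invented:

      import matplotlib.pyplot as plt

      # This code *is* the chart's textual representation: every visual
      # element an MLLM would need to reason about is spelled out here.
      years = [2019, 2020, 2021, 2022, 2023]
      model_a = [62, 68, 71, 79, 85]
      model_b = [55, 60, 70, 76, 88]

      plt.plot(years, model_a, marker="o", label="Model A")
      plt.plot(years, model_b, marker="s", label="Model B")
      plt.xlabel("Year")
      plt.ylabel("Benchmark score")
      plt.title("Hypothetical benchmark scores over time")
      plt.legend()
      plt.savefig("chart.png")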

9. arXiv:2410.13905 [pdf, other] cs.SI cs.AI cs.IR cs.LG
   P4GCN: Vertical Federated Social Recommendation with Privacy-Preserving Two-Party Graph Convolution Networks
   Authors: Zheng Wang, Wanwan Wang, Yimin Huang, Zhaopeng Peng, Ziqi Yang, Cheng Wang, Xiaoliang Fan
   Abstract: In recent years, graph neural networks (GNNs) have been commonly utilized for social recommendation systems. However, real-world scenarios often present challenges related to user privacy and business constraints, inhibiting direct access to valuable social information from other platforms. While many existing methods have tackled matrix factorization-based social recommendations without direct social data access, developing GNN-based federated social recommendation models under similar conditions remains largely unexplored. To address this issue, we propose a novel vertical federated social recommendation method leveraging privacy-preserving two-party graph convolution networks (P4GCN) to enhance recommendation accuracy without requiring direct access to sensitive social information. First, we introduce a Sandwich-Encryption module to ensure comprehensive data privacy during the collaborative computing process. Second, we provide a thorough theoretical analysis of the privacy guarantees, considering the participation of both curious and honest parties. Extensive experiments on four real-world datasets demonstrate that P4GCN outperforms state-of-the-art methods in terms of recommendation accuracy. The code is available at https://github.com/WwZzz/P4GCN.
   Submitted 16 October, 2024; originally announced October 2024.
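
   One generic ingredient behind privacy guarantees of this kind is releasing embeddings only after clipping and noising them. Below is a textbook Gaussian-mechanism sketch, included for orientation only; P4GCN's actual Sandwich-Encryption module is an encryption-based construction, not this mechanism:

      import numpy as np

      rng = np.random.default_rng(0)

      def dp_release(emb, clip=1.0, eps=1.0, delta=1e-5):
          """Clip the L2 norm, then add Gaussian noise calibrated to (eps, delta)-DP."""
          emb = emb * min(1.0, clip / (np.linalg.norm(emb) + 1e-12))
          sigma = clip * np.sqrt(2.0 * np.log(1.25 / delta)) / eps
          return emb + rng.normal(scale=sigma, size=emb.shape)

      user_emb = rng.normal(size=8)   # embedding held by the social-data party
      print(dp_release(user_emb))     # noised version, safer to transmit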

10. arXiv:2410.11302 [pdf, other] cs.CV cs.AI cs.CL cs.LG
   Have the VLMs Lost Confidence? A Study of Sycophancy in VLMs
   Authors: Shuo Li, Tao Ji, Xiaoran Fan, Linsheng Lu, Leyi Yang, Yuming Yang, Zhiheng Xi, Rui Zheng, Yuran Wang, Xiaohui Zhao, Tao Gui, Qi Zhang, Xuanjing Huang
   Abstract: In the study of LLMs, sycophancy represents a prevalent hallucination that poses significant challenges to these models. Specifically, LLMs often fail to adhere to original correct responses, instead blindly agreeing with users' opinions, even when those opinions are incorrect or malicious. However, research on sycophancy in visual language models (VLMs) has been scarce. In this work, we extend the exploration of sycophancy from LLMs to VLMs, introducing the MM-SY benchmark to evaluate this phenomenon. We present evaluation results from multiple representative models, addressing the gap in sycophancy research for VLMs. To mitigate sycophancy, we propose a synthetic dataset for training and employ methods based on prompts, supervised fine-tuning, and DPO. Our experiments demonstrate that these methods effectively alleviate sycophancy in VLMs. Additionally, we probe VLMs to assess the semantic impact of sycophancy and analyze the attention distribution of visual tokens. Our findings indicate that the ability to prevent sycophancy is predominantly observed in higher layers of the model. The lack of attention to image knowledge in these higher layers may contribute to sycophancy, and enhancing image attention at high layers proves beneficial in mitigating this issue.
   Submitted 15 October, 2024; originally announced October 2024.
This module employs differential privacy-based knowledge extraction combined with a feature mapping mechanism, transforming source domain embeddings from federated graph attention networks into reliable domain knowledge. Second, we design a knowledge activation module to filter out potential harmful or conflicting knowledge from source domains, addressing the issues of negative transfer. This module enhances target domain training by expanding the graph of the target domain to generate reliable domain attentions and fine-tunes the target model for improved negative knowledge filtering and more accurate predictions. We conduct extensive experiments on 16 popular domains of the Amazon dataset, demonstrating that FedGCDR significantly outperforms state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08249v2-abstract-full').style.display = 'none'; document.getElementById('2410.08249v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS&#39;24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06340">arXiv:2410.06340</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06340">pdf</a>, <a href="https://arxiv.org/format/2410.06340">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedGraph: A Research Library and Benchmark for Federated Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Y">Yuhang Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xinyi Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kay Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+W">Weizhao Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Ravi%2C+S">Srivatsan Ravi</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Joe-Wong%2C+C">Carlee Joe-Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06340v2-abstract-short" style="display: inline;"> Federated graph learning is an emerging field with significant practical challenges. While many algorithms have been proposed to enhance the accuracy of training graph neural networks, e.g., for node classification problems on large graphs, in a federated manner, their system performance is often overlooked, even though it is crucial for real-world deployment. 
arXiv:2410.06340 [pdf, other] (https://arxiv.org/abs/2410.06340)
Subjects: cs.LG (Machine Learning)
Title: FedGraph: A Research Library and Benchmark for Federated Graph Learning
Authors: Yuhang Yao, Yuan Li, Xinyi Fan, Junhao Li, Kay Liu, Weizhao Jin, Srivatsan Ravi, Philip S. Yu, Carlee Joe-Wong
Abstract: Federated graph learning is an emerging field with significant practical challenges. While many algorithms have been proposed to enhance the accuracy of training graph neural networks in a federated manner, e.g., for node classification on large graphs, their system performance is often overlooked, even though it is crucial for real-world deployment. To address this gap, we introduce FedGraph, a research library built for practical distributed deployment and benchmarking in federated graph learning. FedGraph supports a range of state-of-the-art graph learning methods and includes built-in profiling tools to evaluate system performance, focusing specifically on communication and computation costs during training. Unlike existing benchmark platforms, FedGraph natively incorporates homomorphic encryption to enhance privacy preservation, and it facilitates the development of practical applications by enabling distributed training across multiple physical machines, providing an evaluation framework that can guide the system design of future federated graph learning algorithms. Leveraging these optimizations, we use FedGraph to demonstrate the first privacy-preserving federated learning system to run on graphs with 100 million nodes.
Submitted 1 November, 2024; v1 submitted 8 October, 2024; originally announced October 2024.
Comments: https://github.com/FedGraph/fedgraph
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06151v1-abstract-full').style.display = 'none'; document.getElementById('2410.06151v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05637">arXiv:2410.05637</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.05637">pdf</a>, <a href="https://arxiv.org/format/2410.05637">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Federated Neural Nonparametric Point Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hengyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yaqiong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xuhui Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhilin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Quinn%2C+C+J">Christopher John Quinn</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+L">Longbing Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05637v1-abstract-short" style="display: inline;"> Temporal point processes (TPPs) are effective for modeling event occurrences over time, but they struggle with sparse and uncertain events in federated systems, where privacy is a major concern. To address this, we propose \textit{FedPP}, a Federated neural nonparametric Point Process model. FedPP integrates neural embeddings into Sigmoidal Gaussian Cox Processes (SGCPs) on the client side, which&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05637v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05637v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05637v1-abstract-full" style="display: none;"> Temporal point processes (TPPs) are effective for modeling event occurrences over time, but they struggle with sparse and uncertain events in federated systems, where privacy is a major concern. To address this, we propose \textit{FedPP}, a Federated neural nonparametric Point Process model. 
arXiv:2410.05637 [pdf, other] (https://arxiv.org/abs/2410.05637)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CR (Cryptography and Security)
Title: Federated Neural Nonparametric Point Processes
Authors: Hui Chen, Hengyu Liu, Yaqiong Li, Xuhui Fan, Zhilin Zhao, Feng Zhou, Christopher John Quinn, Longbing Cao
Abstract: Temporal point processes (TPPs) are effective for modeling event occurrences over time, but they struggle with sparse and uncertain events in federated systems, where privacy is a major concern. To address this, we propose FedPP, a Federated neural nonparametric Point Process model. On the client side, FedPP integrates neural embeddings into Sigmoidal Gaussian Cox Processes (SGCPs), a flexible and expressive class of TPPs, allowing it to generate highly flexible intensity functions that capture client-specific event dynamics and uncertainties while efficiently summarizing historical records. For global aggregation, FedPP introduces a divergence-based mechanism that communicates the distributions of SGCPs' kernel hyperparameters between the server and clients, while keeping client-specific parameters local to ensure privacy and personalization. FedPP effectively captures event uncertainty and sparsity, and extensive experiments demonstrate its superior performance in federated settings, particularly with KL-divergence and Wasserstein-distance-based global aggregation.
Submitted 7 October, 2024; originally announced October 2024.
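The divergence-based aggregation described above can be illustrated for 1-D Gaussian posteriors over a kernel hyperparameter: the global Gaussian minimizing the average KL(client || global) is exactly the moment-matched one. A small sketch with made-up client posteriors, not FedPP's implementation:

```python
import numpy as np

def kl_gauss(mu_q, var_q, mu_p, var_p):
    # KL( N(mu_q, var_q) || N(mu_p, var_p) ) for 1-D Gaussians.
    return 0.5 * (np.log(var_p / var_q) + (var_q + (mu_q - mu_p) ** 2) / var_p - 1.0)

# Client posteriors over, say, a kernel lengthscale (illustrative values).
mu = np.array([0.8, 1.1, 0.9])
var = np.array([0.05, 0.08, 0.04])

# Minimizing the average KL(client || global) over a Gaussian global gives
# the moment-matched mean and the average of (variance + squared offset).
g_mu = mu.mean()
g_var = (var + (mu - g_mu) ** 2).mean()
print(g_mu, g_var,
      [round(kl_gauss(m, v, g_mu, g_var), 4) for m, v in zip(mu, var)])
```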
arXiv:2410.03581 [pdf, other] (https://arxiv.org/abs/2410.03581)
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning)
Title: Nonstationary Sparse Spectral Permanental Process
Authors: Zicheng Sun, Yixuan Zhang, Zenan Ling, Xuhui Fan, Feng Zhou
Abstract: Existing permanental processes often impose constraints on kernel types or stationarity, limiting the model's expressiveness. To overcome these limitations, we propose a novel approach utilizing the sparse spectral representation of nonstationary kernels. This technique relaxes the constraints on kernel types and stationarity, allowing for more flexible modeling while reducing computational complexity to the linear level. Additionally, we introduce a deep kernel variant by hierarchically stacking multiple spectral feature mappings, further enhancing the model's expressiveness to capture complex patterns in data. Experimental results on both synthetic and real-world datasets demonstrate the effectiveness of our approach, particularly in scenarios with pronounced data nonstationarity. Ablation studies additionally provide insights into the impact of various hyperparameters on model performance.
Submitted 11 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
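A permanental process models the intensity as the square of a Gaussian-process sample, and the sparse spectral trick replaces the kernel with random Fourier features so evaluation is linear in the number of points. Here is a stationary-kernel toy version; the paper's contribution is the nonstationary and deep variants, which this sketch does not implement.

```python
import numpy as np

rng = np.random.default_rng(0)
R = 50                             # number of spectral features
omega = rng.normal(size=R)         # frequencies from the RBF kernel's spectral density
phase = rng.uniform(0, 2 * np.pi, R)
w = rng.normal(scale=0.3, size=R)  # feature weights (learned in practice)

def phi(t):
    # Random Fourier feature map: phi(t) @ phi(s) approximates k(t, s).
    return np.sqrt(2.0 / R) * np.cos(np.outer(t, omega) + phase)

def intensity(t):
    # Squaring the GP sample keeps the intensity nonnegative, and the
    # cost is O(R) per point rather than cubic in the dataset size.
    return (phi(t) @ w) ** 2

print(intensity(np.linspace(0.0, 10.0, 5)))
```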
arXiv:2410.03212 [pdf, other] (https://arxiv.org/abs/2410.03212)
Subjects: cs.IR (Information Retrieval)
Title: Data-Efficient Massive Tool Retrieval: A Reinforcement Learning Approach for Query-Tool Alignment with Language Models
Authors: Yuxiang Zhang, Xin Fan, Junjie Wang, Chongxian Chen, Fan Mo, Tetsuya Sakai, Hayato Yamana
Abstract: Recent advancements in large language models (LLMs) integrated with external tools and APIs have successfully addressed complex tasks by using in-context learning or fine-tuning. Despite this progress, the vast scale of tool retrieval remains challenging due to stringent input length constraints. In response, we propose a pre-retrieval strategy from an extensive repository, effectively framing the problem as the massive tool retrieval (MTR) task. We introduce the MTRB (massive tool retrieval benchmark) to evaluate real-world tool-augmented LLM scenarios with a large number of tools. This benchmark is designed for low-resource scenarios and includes a diverse collection of tools with descriptions refined for consistency and clarity. It consists of three subsets, each containing 90 test samples and 10 training samples. To handle the low-resource MTR task, we propose a new query-tool alignment (QTA) framework that leverages LLMs to enhance query-tool alignment by rewriting user queries through ranking functions and the direct preference optimization (DPO) method. This approach consistently outperforms existing state-of-the-art models in top-5 and top-10 retrieval tasks across the MTRB benchmark, with improvements of up to 93.28% in Sufficiency@k, a metric that measures the adequacy of tool retrieval within the first k results. Furthermore, ablation studies validate the efficacy of our framework, highlighting its capacity to optimize performance even with limited annotated samples. Specifically, our framework achieves up to a 78.53% performance improvement in Sufficiency@k with just a single annotated sample. Additionally, QTA exhibits strong cross-dataset generalizability, emphasizing its potential for real-world applications.
Submitted 4 October, 2024; originally announced October 2024.
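The abstract defines Sufficiency@k only loosely ("the adequacy of tool retrieval within the first k results"). One plausible reading, the fraction of queries whose required tools all appear in the top-k list, is easy to compute; the exact definition in the paper may differ.

```python
def sufficiency_at_k(retrieved, required, k):
    # Fraction of queries whose required tool set is covered by the
    # top-k retrieved tools (an assumed reading of the paper's metric).
    hits = sum(set(need) <= set(got[:k])
               for got, need in zip(retrieved, required))
    return hits / len(required)

retrieved = [["t3", "t7", "t1"], ["t2", "t9", "t4"], ["t5", "t6", "t8"]]
required = [["t1"], ["t4", "t2"], ["t0"]]
print(sufficiency_at_k(retrieved, required, k=3))  # 2 of 3 -> 0.667
```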
arXiv:2410.02234 [pdf, other] (https://arxiv.org/abs/2410.02234)
Subjects: cs.DB (Databases); cs.DS (Data Structures and Algorithms)
Title: GORAM: Graph-oriented ORAM for Efficient Ego-centric Queries on Federated Graphs
Authors: Xiaoyu Fan, Kun Chen, Jiping Yu, Xiaowei Zhu, Yunyi Chen, Huanchen Zhang, Wei Xu
Abstract: Ego-centric queries, which focus on a target vertex and its direct neighbors, are essential for various applications. Enabling such queries on graphs owned by mutually distrustful data providers, without breaching privacy, holds promise for more comprehensive results. In this paper, we propose GORAM, a graph-oriented data structure that enables efficient ego-centric queries on federated graphs with strong privacy guarantees. GORAM is built upon secure multi-party computation (MPC) and ensures that no single party can learn any sensitive information about the graph data or the querying keys during the process. However, achieving practical performance with privacy guaranteed presents a challenge. To overcome this, GORAM partitions the federated graph and constructs an Oblivious RAM (ORAM)-inspired index atop these partitions. This design enables each ego-centric query to process only a single partition, which can be accessed quickly and securely. To evaluate the performance of GORAM, we developed a prototype querying engine on a real-world MPC framework. We conduct a comprehensive evaluation with five commonly used queries on both synthetic and real-world graphs. Our evaluation shows that all benchmark queries can be completed in 58.1 milliseconds to 35.7 seconds, even on graphs with up to 41.6 million vertices and 1.4 billion edges. To the best of our knowledge, this represents the first instance of processing billion-scale graphs with practical performance on MPC.
Submitted 3 October, 2024; originally announced October 2024.
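GORAM's performance trick is to partition the graph so each ego-centric query touches exactly one partition, with an ORAM-style index hiding what is read inside it. The toy below only shows the partition-and-pad access pattern in the clear; the real system evaluates this under MPC, and the pad size would be set to the maximum padded degree rather than truncating.

```python
import hashlib

PARTITIONS, PAD = 8, 4  # partition count; fixed slots read per query

def partition_of(vertex):
    # Deterministic partition assignment (a stand-in for GORAM's layout).
    return int(hashlib.sha256(vertex.encode()).hexdigest(), 16) % PARTITIONS

def ego_query(partitions, vertex):
    # Touch one partition and always read exactly PAD slots, so the
    # observable access count is independent of the vertex's true degree.
    adj = partitions[partition_of(vertex)]
    reads = (adj.get(vertex, []) + [None] * PAD)[:PAD]
    return [v for v in reads if v is not None]

partitions = [dict() for _ in range(PARTITIONS)]
partitions[partition_of("alice")]["alice"] = ["bob", "carol"]
print(ego_query(partitions, "alice"))  # ['bob', 'carol']
```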
arXiv:2410.01769 [pdf, other] (https://arxiv.org/abs/2410.01769)
Subjects: cs.CL (Computation and Language)
Title: Quantifying Generalization Complexity for Large Language Models
Authors: Zhenting Qi, Hongyin Luo, Xuliang Huang, Zhuokai Zhao, Yibo Jiang, Xiangjun Fan, Himabindu Lakkaraju, James Glass
Abstract: While large language models (LLMs) have shown exceptional capabilities in understanding complex queries and performing sophisticated tasks, their generalization abilities are often deeply entangled with memorization, necessitating more precise evaluation. To address this challenge, we introduce Scylla, a dynamic evaluation framework that quantitatively measures the generalization abilities of LLMs. Scylla disentangles generalization from memorization by assessing model performance on both in-distribution (ID) and out-of-distribution (OOD) data through 20 tasks across 5 levels of complexity. Through extensive experiments, we uncover a non-monotonic relationship between task complexity and the performance gap between ID and OOD data, which we term the generalization valley. Specifically, this phenomenon reveals a critical threshold, referred to as critical complexity, where reliance on non-generalizable behavior peaks, indicating the upper bound of LLMs' generalization capabilities. As model size increases, the critical complexity shifts toward higher levels of task complexity, suggesting that larger models can handle more complex reasoning tasks before over-relying on memorization. Leveraging Scylla and the concept of critical complexity, we benchmark 28 LLMs, including open-sourced models such as the LLaMA and Qwen families and closed-sourced models like Claude and GPT, providing a more robust evaluation and a clearer understanding of LLMs' generalization capabilities.
Submitted 3 October, 2024; v1 submitted 2 October, 2024; originally announced October 2024.
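The "generalization valley" is just the ID-OOD performance gap traced over complexity levels, and critical complexity is where that gap peaks. A sketch with toy accuracies, not the paper's numbers:

```python
import numpy as np

levels = np.arange(1, 6)                             # 5 complexity levels
acc_id = np.array([0.98, 0.95, 0.88, 0.70, 0.52])    # in-distribution
acc_ood = np.array([0.97, 0.90, 0.68, 0.56, 0.45])   # out-of-distribution

gap = acc_id - acc_ood             # reliance on non-generalizable behavior
critical = levels[np.argmax(gap)]  # "critical complexity": the gap's peak
print(f"gap per level: {gap}; critical complexity = level {critical}")
```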
arXiv:2409.15690 [pdf, other] (https://arxiv.org/abs/2409.15690)
Subjects: cs.CL (Computation and Language); cs.IR (Information Retrieval); cs.SI (Social and Information Networks)
Title: A Survey of Stance Detection on Social Media: New Directions and Perspectives
Authors: Bowen Zhang, Genan Dai, Fuqiang Niu, Nan Yin, Xiaomao Fan, Hu Huang
Abstract: In modern digital environments, users frequently express opinions on contentious topics, providing a wealth of information on prevailing attitudes. The systematic analysis of these opinions offers valuable insights for decision-making in various sectors, including marketing and politics. As a result, stance detection has emerged as a crucial subfield within affective computing, enabling the automatic detection of user stances in social media conversations and providing a nuanced understanding of public sentiment on complex issues. Recent years have seen a surge of research interest in developing effective stance detection methods, with contributions from multiple communities, including natural language processing, web science, and social computing. This paper provides a comprehensive survey of stance detection techniques on social media, covering task definitions, datasets, approaches, and future work. We review traditional stance detection models as well as state-of-the-art methods based on large language models, and discuss their strengths and limitations. Our survey highlights the importance of stance detection in understanding public opinion and sentiment, and identifies gaps in current research. We conclude by outlining potential future directions for stance detection on social media, including the need for more robust and generalizable models, and the importance of addressing emerging challenges such as multi-modal stance detection and stance detection in low-resource languages.
Submitted 23 September, 2024; originally announced September 2024.

arXiv:2409.14368 [pdf, ps, other] (https://arxiv.org/abs/2409.14368)
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction)
Title: Evaluating the Quality of Code Comments Generated by Large Language Models for Novice Programmers
Authors: Aysa Xuemo Fan, Arun Balajiee Lekshmi Narayanan, Mohammad Hassany, Jiaze Ke
Abstract: Large Language Models (LLMs) show promise in generating code comments for novice programmers, but their educational effectiveness remains under-evaluated. This study assesses the instructional quality of code comments produced by GPT-4, GPT-3.5-Turbo, and Llama2, compared to expert-developed comments, focusing on their suitability for novices. Analyzing a dataset of "easy"-level Java solutions from LeetCode, we find that GPT-4 exhibits quality comparable to expert comments in aspects critical for beginners, such as clarity, beginner-friendliness, concept elucidation, and step-by-step guidance. GPT-4 outperforms Llama2 in discussing complexity (chi-square = 11.40, p = 0.001) and is perceived as significantly more supportive for beginners than GPT-3.5 and Llama2 (Mann-Whitney U-statistics = 300.5 and 322.5; p = 0.0017 and 0.0003). This study highlights the potential of LLMs for generating code comments tailored to novice programmers.
Submitted 22 September, 2024; originally announced September 2024.
arXiv:2409.13371 [pdf] (https://arxiv.org/abs/2409.13371)
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: MCICSAM: Monte Carlo-guided Interpolation Consistency Segment Anything Model for Semi-Supervised Prostate Zone Segmentation
Authors: Guantian Huang, Beibei Li, Xiaobing Fan, Aritrick Chatterjee, Cheng Wei, Shouliang Qi, Wei Qian, Dianning He
Abstract: Accurate segmentation of the various regions within the prostate is pivotal for diagnosing and treating prostate-related diseases. However, the scarcity of labeled data, particularly in specialized medical fields like prostate imaging, poses a significant challenge. The Segment Anything Model (SAM) is a new large model for natural image segmentation, but it faces some challenges in medical imaging. To better utilize SAM's powerful feature extraction capability and to address the low volume of annotated medical image data, we use Low-Rank Adaptation (LoRA) and the semi-supervised learning method of Monte Carlo-guided interpolation consistency (MCIC) to enhance the fine-tuned SAM. We propose the Monte Carlo-guided Interpolation Consistency Segment Anything Model (MCICSAM) for semi-supervised prostate region segmentation. On unlabeled data, MCIC performs two different interpolation transformations on the input and incorporates Monte Carlo uncertainty analysis in the output, forcing the model to be consistent in its predictions. The consistency constraints imposed on these interpolated samples allow the model to better fit the distribution of unlabeled data, ultimately improving its performance in semi-supervised scenarios. We use Dice and the Hausdorff Distance at the 95th percentile (HD95) to validate model performance. MCICSAM yields Dice scores of 79.38% and 89.95%, along with improved HD95 values of 3.12 and 2.27, for the transition zone and peripheral zone, respectively. MCICSAM also demonstrates strong generalizability. This method is expected to bring new possibilities to the field of prostate image segmentation.
Submitted 20 September, 2024; originally announced September 2024.
Comments: 13 pages, 5 figures
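The MCIC idea, enforcing that predictions on interpolated inputs match interpolated predictions, weighted by Monte Carlo uncertainty, can be sketched with a toy stochastic predictor (random feature masks standing in for MC dropout). Everything here is illustrative, not the paper's architecture:

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(scale=0.5, size=(16, 1))  # a fixed toy "model"

def mc_predict(x, n_samples=8, drop=0.3):
    # Stochastic forward passes (MC-dropout style): random feature masks
    # give a mean prediction and a per-output uncertainty estimate.
    preds = []
    for _ in range(n_samples):
        mask = rng.random(x.shape[-1]) > drop
        preds.append((x * mask) @ W / (1 - drop))
    preds = np.stack(preds)
    return preds.mean(axis=0), preds.var(axis=0)

x1, x2 = rng.normal(size=(2, 16))
lam = 0.6
mix_mean, mix_var = mc_predict(lam * x1 + (1 - lam) * x2)
m1, _ = mc_predict(x1)
m2, _ = mc_predict(x2)

# Consistency loss on the interpolated sample, down-weighted where the
# Monte Carlo uncertainty is high.
loss = np.mean((mix_mean - (lam * m1 + (1 - lam) * m2)) ** 2 / (1.0 + mix_var))
print(float(loss))
```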
arXiv:2409.12000 [pdf, ps, other] (https://arxiv.org/abs/2409.12000)
Subjects: cs.HC (Human-Computer Interaction); cs.CY (Computers and Society); cs.LG (Machine Learning); cs.SI (Social and Information Networks)
Title: "It Might be Technically Impressive, But It's Practically Useless to Us": Practices, Challenges, and Opportunities for Cross-Functional Collaboration around AI within the News Industry
Authors: Qing Xiao, Xianzhe Fan, Felix M. Simon, Bingbing Zhang, Motahhare Eslami
Abstract: Recently, an increasing number of news organizations have integrated artificial intelligence (AI) into their workflows, leading to a further influx of AI technologists and data workers into the news industry. This has initiated cross-functional collaborations between these professionals and journalists. While prior research has explored the impact of AI-related roles entering the news industry, there is a lack of studies on how cross-functional collaboration unfolds between AI professionals and journalists. Through interviews with 17 journalists, 6 AI technologists, and 3 AI workers with cross-functional experience from leading news organizations, we investigate the current practices, challenges, and opportunities for cross-functional collaboration around AI in today's news industry. We first study how journalists and AI professionals perceive existing cross-collaboration strategies. We further explore the challenges of cross-functional collaboration and provide recommendations for enhancing future cross-functional collaboration around AI in the news industry.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 16 pages

arXiv:2409.11703 [pdf, other] (https://arxiv.org/abs/2409.11703)
Subjects: cs.CL (Computation and Language)
Title: Harnessing LLMs for API Interactions: A Framework for Classification and Synthetic Data Generation
Authors: Chunliang Tao, Xiaojing Fan, Yahe Yang
Abstract: As Large Language Models (LLMs) advance in natural language processing, there is growing interest in leveraging their capabilities to simplify software interactions. In this paper, we propose a novel system that integrates LLMs both to classify natural language inputs into corresponding API calls and to automate the creation of sample datasets tailored to specific API functions. By classifying natural language commands, our system allows users to invoke complex software functionalities through simple inputs, improving interaction efficiency and lowering the barrier to software utilization. Our dataset generation approach also enables the efficient and systematic evaluation of different LLMs at classifying API calls, offering developers and business owners a practical tool for assessing the suitability of LLMs for customized API management. We conduct experiments on several prominent LLMs using generated sample datasets for various API functions. The results show that GPT-4 achieves a high classification accuracy of 0.996, while LLaMA-3-8B performs much worse at 0.759. These findings highlight the potential of LLMs to transform API management and validate the effectiveness of our system in guiding model testing and selection across diverse applications.
Submitted 18 September, 2024; originally announced September 2024.
arXiv:2409.10670 [pdf, ps, other] (https://arxiv.org/abs/2409.10670)
Subjects: cs.HC (Human-Computer Interaction); cs.CY (Computers and Society); cs.SI (Social and Information Networks)
Title: Let's Influence Algorithms Together: How Millions of Fans Build Collective Understanding of Algorithms and Organize Coordinated Algorithmic Actions
Authors: Qing Xiao, Yuhang Zheng, Xianzhe Fan, Bingbing Zhang, Zhicong Lu
Abstract: Previous research has paid attention to how users strategically understand and consciously interact with algorithms, but it mainly focuses on the individual level, making it difficult to explore how users within communities develop a collective understanding of algorithms and organize collective algorithmic actions. Through a two-year ethnography of online fan activities, this study investigates 43 core fans who routinely organize large-scale collective actions, along with their corresponding general fan groups. This study aims to reveal how these core fans mobilize millions of general fans through collective algorithmic actions. The core fans reported the rhetorical strategies used to persuade general fans, the steps taken to build a collective understanding of algorithms, and the collaborative processes that adapt collective actions across platforms and cultures. Our findings highlight the key factors that enable computer-supported collective algorithmic actions and extend collective action research to large-scale actions targeting algorithms.
Submitted 16 September, 2024; originally announced September 2024.
Comments: 15 pages

arXiv:2409.07453 [pdf, other] (https://arxiv.org/abs/2409.07453)
Subjects: cs.AI (Artificial Intelligence); cs.HC (Human-Computer Interaction)
Title: "My Grade is Wrong!": A Contestable AI Framework for Interactive Feedback in Evaluating Student Essays
Authors: Shengxin Hong, Chang Cai, Sixuan Du, Haiyue Feng, Siyuan Liu, Xiuyi Fan
Abstract: Interactive feedback, where feedback flows in both directions between teacher and student, is more effective than traditional one-way feedback. However, it is often too time-consuming for widespread use in educational practice. While Large Language Models (LLMs) have potential for automating feedback, they struggle with reasoning and interaction in an interactive setting. This paper introduces CAELF, a Contestable AI Empowered LLM Framework for automating interactive feedback. CAELF allows students to query, challenge, and clarify their feedback by integrating a multi-agent system with computational argumentation. Essays are first assessed by multiple Teaching-Assistant Agents (TA Agents), and a Teacher Agent then aggregates the evaluations through formal reasoning to generate feedback and grades. Students can further engage with the feedback to refine their understanding. A case study on 500 critical-thinking essays, together with user studies, demonstrates that CAELF significantly improves interactive feedback, enhancing the reasoning and interaction capabilities of LLMs. This approach offers a promising solution to the time and resource barriers that have limited the adoption of interactive feedback in educational settings.
Submitted 11 September, 2024; originally announced September 2024.
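The "formal reasoning" step above refers to computational argumentation; a standard building block is computing which arguments survive their attackers. Below is a self-contained grounded-semantics routine; the argument names are invented for illustration, and CAELF's actual aggregation may use different semantics.

```python
def grounded_extension(arguments, attacks):
    # Grounded semantics of an abstract argumentation framework:
    # repeatedly accept arguments whose attackers are all defeated.
    accepted, defeated = set(), set()
    changed = True
    while changed:
        changed = False
        for a in arguments:
            if a in accepted or a in defeated:
                continue
            attackers = {x for (x, y) in attacks if y == a}
            if attackers <= defeated:
                accepted.add(a)
                defeated |= {y for (x, y) in attacks if x == a}
                changed = True
    return accepted

# TA-Agent claims as arguments; the Teacher Agent keeps what survives.
arguments = {"grade_B", "evidence_weak", "rebuttal_cites_rubric"}
attacks = {("evidence_weak", "grade_B"),
           ("rebuttal_cites_rubric", "evidence_weak")}
print(grounded_extension(arguments, attacks))  # grade_B survives
```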
arXiv:2409.07188 [pdf, other] physics.ao-ph cs.LG

FuXi-2.0: Advancing machine learning weather forecasting model for practical applications

Authors: Xiaohui Zhong, Lei Chen, Xu Fan, Wenxu Qian, Jun Liu, Hao Li

Abstract: Machine learning (ML) models have become increasingly valuable in weather forecasting, providing forecasts that not only lower computational costs but often match or exceed the accuracy of traditional numerical weather prediction (NWP) models. Despite their potential, ML models typically suffer from coarse temporal resolution, typically 6 hours, and a limited set of meteorological variables, limiting their practical applicability. To overcome these challenges, we introduce FuXi-2.0, an advanced ML model that delivers 1-hourly global weather forecasts and includes a comprehensive set of essential meteorological variables, thereby expanding its utility across sectors such as wind and solar energy, aviation, and marine shipping. Our study conducts comparative analyses between ML-based 1-hourly forecasts and those from the high-resolution forecast (HRES) of the European Centre for Medium-Range Weather Forecasts (ECMWF) for various practical scenarios. The results demonstrate that FuXi-2.0 consistently outperforms ECMWF HRES in forecasting key meteorological variables relevant to these sectors. In particular, FuXi-2.0 shows superior performance in wind power forecasting compared to ECMWF HRES, further validating its efficacy as a reliable tool for scenarios demanding precise weather forecasts. Additionally, FuXi-2.0 integrates both atmospheric and oceanic components, representing a significant step forward in the development of coupled atmosphere-ocean models. Further comparative analyses reveal that FuXi-2.0 provides more accurate forecasts of tropical cyclone intensity than its predecessor, FuXi-1.0, suggesting that an atmosphere-ocean coupled model has benefits over atmosphere-only models.

Submitted 11 September, 2024; originally announced September 2024.
arXiv:2409.05243 [pdf, other] cs.CV

Mamba-Enhanced Text-Audio-Video Alignment Network for Emotion Recognition in Conversations

Authors: Xinran Li, Xiaomao Fan, Qingyang Wu, Xiaojiang Peng, Ye Li

Abstract: Emotion Recognition in Conversations (ERC) is a vital area within multimodal interaction research, dedicated to accurately identifying and classifying the emotions expressed by speakers throughout a conversation. Traditional ERC approaches predominantly rely on unimodal cues, such as text, audio, or visual data, leading to limitations in their effectiveness. These methods encounter two significant challenges: 1) Consistency in multimodal information. Before integrating various modalities, it is crucial to ensure that the data from different sources is aligned and coherent. 2) Contextual information capture. Successfully fusing multimodal features requires a keen understanding of the evolving emotional tone, especially in lengthy dialogues where emotions may shift and develop over time. To address these limitations, we propose a novel Mamba-enhanced Text-Audio-Video alignment network (MaTAV) for the ERC task. MaTAV has the advantages of aligning unimodal features to ensure consistency across different modalities and of handling long input sequences to better capture contextual multimodal information. Extensive experiments on the MELD and IEMOCAP datasets demonstrate that MaTAV outperforms existing state-of-the-art methods on the ERC task by a large margin.

Submitted 8 September, 2024; originally announced September 2024.
arXiv:2409.04927 [pdf, other] cs.CL eess.AS

Just ASR + LLM? A Study on Speech Large Language Models' Ability to Identify and Understand Speaker in Spoken Dialogue

Authors: Junkai Wu, Xulin Fan, Bo-Ru Lu, Xilin Jiang, Nima Mesgarani, Mark Hasegawa-Johnson, Mari Ostendorf

Abstract: In recent years, we have observed a rapid advancement in speech language models (SpeechLLMs), catching up with humans' listening and reasoning abilities. SpeechLLMs have demonstrated impressive spoken dialogue question-answering (SQA) performance in benchmarks like Gaokao, the English listening test of the college entrance exam in China, which seemingly requires understanding both the spoken content and the voice characteristics of speakers in a conversation. However, after carefully examining Gaokao's questions, we find that the correct answers to many questions can be inferred from the conversation transcript alone, i.e., without speaker segmentation and identification. Our evaluation of the state-of-the-art models Qwen-Audio and WavLLM on both Gaokao and our proposed "What Do You Like?" dataset shows significantly higher accuracy on these context-based questions than on identity-critical questions, which can only be answered reliably with correct speaker identification. The results and analysis suggest that, when solving SQA, current SpeechLLMs exhibit limited speaker awareness from the audio and behave similarly to an LLM reasoning from the conversation transcript without sound. We propose that tasks focused on identity-critical questions could offer a more accurate evaluation framework for SpeechLLMs in SQA.

Submitted 2 October, 2024; v1 submitted 7 September, 2024; originally announced September 2024.

Comments: Accepted to IEEE SLT 2024.

arXiv:2409.01459 [pdf, other] cs.CV

3D-LSPTM: An Automatic Framework with 3D-Large-Scale Pretrained Model for Laryngeal Cancer Detection Using Laryngoscopic Videos

Authors: Meiyu Qiu, Yun Li, Wenjun Huang, Haoyun Zhang, Weiping Zheng, Wenbin Lei, Xiaomao Fan

Abstract: Laryngeal cancer is a malignant disease with a high mortality rate in otorhinolaryngology, posing a significant threat to human health. Traditionally, laryngologists manually inspect laryngoscopic videos for laryngeal cancer, which is quite time-consuming and subjective. In this study, we propose a novel automatic framework via 3D-large-scale pretrained models, termed 3D-LSPTM, for laryngeal cancer detection. First, we collect 1,109 laryngoscopic videos from the First Affiliated Hospital of Sun Yat-sen University with the approval of the Ethics Committee. We then utilize the 3D-large-scale pretrained models C3D, TimeSformer, and Video-Swin-Transformer, which excel at extracting video features, for laryngeal cancer detection with fine-tuning techniques. Extensive experiments show that the proposed 3D-LSPTM achieves promising performance on the task of laryngeal cancer detection. In particular, 3D-LSPTM with a Video-Swin-Transformer backbone achieves 92.4% accuracy, 95.6% sensitivity, 94.1% precision, and a 94.8% F1-score.

Submitted 2 September, 2024; originally announced September 2024.
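The reported metrics follow the standard binary-classification definitions, which the snippet below recomputes from confusion-matrix counts (the counts themselves are illustrative, not the paper's data):

```python
# Standard detection metrics from confusion-matrix counts.
def binary_metrics(tp, fp, tn, fn):
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    sensitivity = tp / (tp + fn)          # recall on the cancer class
    precision = tp / (tp + fp)
    f1 = 2 * precision * sensitivity / (precision + sensitivity)
    return accuracy, sensitivity, precision, f1

# tp/fp/tn/fn below are made-up example counts, not results from the paper.
print(binary_metrics(tp=130, fp=8, tn=60, fn=6))
```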
arXiv:2409.00904 [pdf, other] cs.CV cs.AI

Multi-scale Temporal Fusion Transformer for Incomplete Vehicle Trajectory Prediction

Authors: Zhanwen Liu, Chao Li, Yang Wang, Nan Yang, Xing Fan, Jiaqi Ma, Xiangmo Zhao

Abstract: Motion prediction plays an essential role in autonomous driving systems, enabling autonomous vehicles to achieve more accurate local-path planning and driving decisions based on predictions of the surrounding vehicles. However, existing methods neglect the potential missing values caused by object occlusion, perception failures, etc., which inevitably degrades trajectory prediction performance in real traffic scenarios. To address this limitation, we propose a novel end-to-end framework for incomplete vehicle trajectory prediction, named Multi-scale Temporal Fusion Transformer (MTFT), which consists of a Multi-scale Attention Head (MAH) and a Continuity Representation-guided Multi-scale Fusion (CRMF) module. Specifically, the MAH leverages the multi-head attention mechanism to capture, in parallel, multi-scale motion representations of the trajectory at different temporal granularities, thus mitigating the adverse effect of missing values on prediction. The multi-scale motion representation is then fed into the CRMF module for multi-scale fusion to obtain a robust temporal feature of the vehicle. During the fusion process, the continuity representation of vehicle motion is first extracted across time steps to guide the fusion, ensuring that the resulting temporal feature incorporates both detailed information and the overall trend of vehicle motion, which facilitates accurate decoding of a future trajectory consistent with the vehicle's motion trend. We evaluate the proposed model on four datasets derived from highway and urban traffic scenarios. The experimental results demonstrate its superior performance on the incomplete vehicle trajectory prediction task compared with state-of-the-art models, e.g., a comprehensive performance improvement of more than 39% on the HighD dataset.

Submitted 1 September, 2024; originally announced September 2024.
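A rough sketch of the multi-scale idea behind the MAH, under our own simplifying assumption (not stated in the paper) that coarser granularities come from average-pooling the trajectory before self-attention, so a value missing at the finest scale still contributes at coarser ones:

```python
# Sketch of attending to a trajectory at several temporal granularities.
import torch
import torch.nn as nn

class MultiScaleAttentionHead(nn.Module):
    def __init__(self, dim=64, scales=(1, 2, 4)):
        super().__init__()
        self.scales = scales
        self.attn = nn.ModuleList(
            nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
            for _ in scales)

    def forward(self, traj):                       # traj: (B, T, dim)
        feats = []
        for scale, attn in zip(self.scales, self.attn):
            x = traj if scale == 1 else nn.functional.avg_pool1d(
                traj.transpose(1, 2), scale).transpose(1, 2)
            out, _ = attn(x, x, x)                 # self-attention at this scale
            feats.append(out.mean(dim=1))          # pooled per-scale feature
        return torch.stack(feats, dim=1)           # (B, num_scales, dim)

feats = MultiScaleAttentionHead()(torch.randn(2, 16, 64))
```

The CRMF module would then fuse the per-scale features under a continuity prior; that fusion is beyond what the abstract specifies.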
arXiv:2409.00862 [pdf, other] cs.HC

User-Driven Value Alignment: Understanding Users' Perceptions and Strategies for Addressing Biased and Discriminatory Statements in AI Companions

Authors: Xianzhe Fan, Qing Xiao, Xuhui Zhou, Jiaxin Pei, Maarten Sap, Zhicong Lu, Hong Shen

Abstract: Large language model-based AI companions are increasingly viewed by users as friends or romantic partners, leading to deep emotional bonds. However, they can generate biased, discriminatory, and harmful outputs. Recently, users have begun taking the initiative to address these harms and re-align AI companions. We introduce the concept of user-driven value alignment, where users actively identify, challenge, and attempt to correct AI outputs they perceive as harmful, aiming to guide the AI to better align with their values. We analyzed 77 social media posts about discriminatory AI statements and conducted semi-structured interviews with 20 experienced users. Our analysis revealed six common types of discriminatory statements perceived by users, how users make sense of those AI behaviors, and seven user-driven alignment strategies, such as gentle persuasion and anger expression. We discuss implications for supporting user-driven value alignment in future AI systems, where users and their communities have greater agency.

Submitted 1 September, 2024; originally announced September 2024.

Comments: 17 pages, 1 figure.

arXiv:2408.15516 [pdf, other] cs.NI

Predicting Parameter Change's Effect on Cellular Network Time Series

Authors: Mingjie Li, Yongqian Sun, Xiaolei Hua, Renkai Yu, Xinwen Fan, Lin Zhu, Junlan Feng, Dan Pei

Abstract: Cellular networks provide convenient network access for an ever-growing number of mobile phones. During continuous optimization, operators can flexibly adjust cell parameters to enhance Quality of Service (QoS). A precise prediction of a parameter change's effect can help operators make proper parameter adjustments. This work focuses on predicting cell status (such as the workload and QoS) after adjusting cell parameters. The prediction is conducted before an adjustment is actually applied, providing an early inspection. Because the limited number of available parameter adjustments can hardly cover all combinations of parameters and user behaviors, we propose ParaSeer, which fuses domain knowledge about parameter adjustments into data-driven time series forecasting. ParaSeer organizes several pre-trained Transformers for adjustment-free time series forecasting, utilizing plenty of adjustment-free data. On the other hand, ParaSeer models the effect of adjusting the transmission power and cell individual offset (CIO) as a multiplier on the workload. We derive a formula to calculate the multiplier from the underlying mechanism of those two parameters, helping ParaSeer reduce its reliance on data with parameter adjustments. We compare ParaSeer with baselines on two real-world datasets, where ParaSeer outperforms the best baseline by more than 25.8% in terms of RMSE. Extensive experiments further illustrate the contributions of ParaSeer's components.

Submitted 28 August, 2024; originally announced August 2024.
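The abstract does not give ParaSeer's multiplier formula, so the sketch below only illustrates the shape of the approach: an adjustment-free forecast is scaled by a multiplier computed from the parameter change. The exponential form and the coefficients are placeholders invented for illustration, not the derivation in the paper:

```python
# Placeholder illustration: scale an adjustment-free forecast by a
# multiplier derived from the (power, CIO) change.
import math

def workload_multiplier(delta_power_db: float, delta_cio_db: float,
                        k_power: float = 0.05, k_cio: float = 0.08) -> float:
    """Hypothetical mapping from parameter deltas to a workload scale factor."""
    return math.exp(k_power * delta_power_db + k_cio * delta_cio_db)

baseline_forecast = [120.0, 135.0, 150.0]       # adjustment-free prediction
m = workload_multiplier(delta_power_db=2.0, delta_cio_db=1.0)
adjusted = [m * x for x in baseline_forecast]   # predicted post-change series
```

The appeal of this factorization is that the Transformers only ever learn the adjustment-free dynamics, for which data is plentiful.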
arXiv:2408.13712 [pdf, other] cs.CV cs.MM

Riemann-based Multi-scale Attention Reasoning Network for Text-3D Retrieval

Authors: Wenrui Li, Wei Han, Yandu Chen, Yeyu Chai, Yidan Lu, Xingtao Wang, Xiaopeng Fan

Abstract: Due to the challenges in acquiring paired Text-3D data and the inherent irregularity of 3D data structures, combined representation learning of 3D point clouds and text remains underexplored. In this paper, we propose a novel Riemann-based Multi-scale Attention Reasoning Network (RMARN) for text-3D retrieval. Specifically, the extracted text and point cloud features are refined by their respective Adaptive Feature Refiners (AFR). Furthermore, we introduce the innovative Riemann Local Similarity (RLS) module and the Global Pooling Similarity (GPS) module. Because 3D point cloud data and text data often possess complex geometric structures in high-dimensional space, the proposed RLS employs a novel Riemann Attention Mechanism to reflect the intrinsic geometric relationships of the data. Without explicitly defining the manifold, RMARN learns the manifold parameters to better represent the distances between text-point cloud samples. To address the lack of paired text-3D data, we have created the large-scale Text-3D Retrieval dataset T3DR-HIT, which comprises over 3,380 pairs of text and point cloud data. T3DR-HIT contains coarse-grained indoor 3D scenes and fine-grained Chinese artifact scenes, consisting of 1,380 and over 2,000 text-3D pairs, respectively. Experiments on our custom datasets demonstrate the superior performance of the proposed method. Our code and the proposed datasets are available at https://github.com/liwrui/RMARN.

Submitted 24 August, 2024; originally announced August 2024.
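Learning distances without an explicit manifold can be illustrated with a toy learned metric d(x, y) = (x - y)^T L L^T (x - y). This is only a stand-in for the Riemann Attention Mechanism, whose actual parameterization the abstract does not specify; the factor L keeps the learned metric positive semi-definite:

```python
# Toy learned metric between text and point-cloud embeddings.
import torch
import torch.nn as nn

class LearnedMetric(nn.Module):
    def __init__(self, dim=32):
        super().__init__()
        self.L = nn.Parameter(torch.eye(dim))   # metric factor, learned end-to-end

    def forward(self, x, y):                    # x, y: (B, dim)
        diff = (x - y) @ self.L                 # project the difference through L
        return (diff * diff).sum(-1)            # squared learned distance

d = LearnedMetric()(torch.randn(4, 32), torch.randn(4, 32))
```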
arXiv:2408.13711 [pdf, other] cs.CV cs.MM

SceneDreamer360: Text-Driven 3D-Consistent Scene Generation with Panoramic Gaussian Splatting

Authors: Wenrui Li, Fucheng Cai, Yapeng Mi, Zhe Yang, Wangmeng Zuo, Xingtao Wang, Xiaopeng Fan

Abstract: Text-driven 3D scene generation has seen significant advancements recently. However, most existing methods generate single-view images using generative models and then stitch them together in 3D space. This independent generation for each view often results in spatial inconsistency and implausibility in the 3D scenes. To address this challenge, we propose a novel text-driven 3D-consistent scene generation model: SceneDreamer360. Our proposed method leverages a text-driven panoramic image generation model as a prior for 3D scene generation and employs 3D Gaussian Splatting (3DGS) to ensure consistency across multi-view panoramic images. Specifically, SceneDreamer360 enhances the fine-tuned PanFusion generator with a three-stage panoramic enhancement, enabling the generation of high-resolution, detail-rich panoramic images. During 3D scene construction, a novel point cloud fusion initialization method is used, producing higher-quality and spatially consistent point clouds. Our extensive experiments demonstrate that, compared to other methods, SceneDreamer360 with its panoramic image generation and 3DGS can produce higher-quality, spatially consistent, and visually appealing 3D scenes from any text prompt. Our code is available at https://github.com/liwrui/SceneDreamer360.

Submitted 13 October, 2024; v1 submitted 24 August, 2024; originally announced August 2024.

arXiv:2408.07540 [pdf, other] cs.CV cs.MM

3D Gaussian Editing with A Single Image

Authors: Guan Luo, Tian-Xing Xu, Ying-Tian Liu, Xiao-Xiong Fan, Fang-Lue Zhang, Song-Hai Zhang

Abstract: The modeling and manipulation of 3D scenes captured from the real world are pivotal in various applications, attracting growing research interest. While previous works on editing have achieved interesting results through manipulating 3D meshes, they often require accurately reconstructed meshes to perform editing, which limits their application in 3D content generation. To address this gap, we introduce a novel single-image-driven 3D scene editing approach based on 3D Gaussian Splatting, enabling intuitive manipulation via directly editing the content on a 2D image plane. Our method learns to optimize the 3D Gaussians to align with an edited version of the image rendered from a user-specified viewpoint of the original scene. To capture long-range object deformation, we introduce a positional loss into the optimization process of 3D Gaussian Splatting and enable gradient propagation through reparameterization. To handle occluded 3D Gaussians when rendering from the specified viewpoint, we build an anchor-based structure and employ a coarse-to-fine optimization strategy capable of handling long-range deformation while maintaining structural stability. Furthermore, we design a novel masking strategy to adaptively identify non-rigid deformation regions for fine-scale modeling. Extensive experiments show the effectiveness of our method in handling geometric details, long-range, and non-rigid deformation, demonstrating superior editing flexibility and quality compared to previous approaches.

Submitted 14 August, 2024; originally announced August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07467">arXiv:2408.07467</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.07467">pdf</a>, <a href="https://arxiv.org/format/2408.07467">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Domain-invariant Representation Learning via Segment Anything Model for Blood Cell Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongcheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+L">Lingcong Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Ying Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Cheng Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yupeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jingyan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+G">Genan Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jingzhou Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiangzhong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaomao Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07467v1-abstract-short" style="display: inline;"> Accurate classification of blood cells is of vital significance in the diagnosis of hematological disorders. However, in real-world scenarios, domain shifts caused by the variability in laboratory procedures and settings, result in a rapid deterioration of the model&#39;s generalization performance. To address this issue, we propose a novel framework of domain-invariant representation learning (DoRL)&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07467v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07467v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07467v1-abstract-full" style="display: none;"> Accurate classification of blood cells is of vital significance in the diagnosis of hematological disorders. However, in real-world scenarios, domain shifts caused by the variability in laboratory procedures and settings, result in a rapid deterioration of the model&#39;s generalization performance. To address this issue, we propose a novel framework of domain-invariant representation learning (DoRL) via segment anything model (SAM) for blood cell classification. The DoRL comprises two main components: a LoRA-based SAM (LoRA-SAM) and a cross-domain autoencoder (CAE). The advantage of DoRL is that it can extract domain-invariant representations from various blood cell datasets in an unsupervised manner. Specifically, we first leverage the large-scale foundation model of SAM, fine-tuned with LoRA, to learn general image embeddings and segment blood cells. 
Additionally, we introduce CAE to learn domain-invariant representations across different-domain datasets while mitigating images&#39; artifacts. To validate the effectiveness of domain-invariant representations, we employ five widely used machine learning classifiers to construct blood cell classification models. Experimental results on two public blood cell datasets and a private real dataset demonstrate that our proposed DoRL achieves a new state-of-the-art cross-domain performance, surpassing existing methods by a significant margin. The source code can be available at the URL (https://github.com/AnoK3111/DoRL). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07467v1-abstract-full').style.display = 'none'; document.getElementById('2408.07467v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06716">arXiv:2408.06716</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06716">pdf</a>, <a href="https://arxiv.org/format/2408.06716">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Cross-Domain Single Blood Cell Image Classification via Large-Scale LoRA-based Segment Anything Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yongcheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+L">Lingcong Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Ying Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yupeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jingyan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+G">Genan Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+J">Jingzhou Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiangzhong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+X">Xiaomao Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06716v1-abstract-short" style="display: inline;"> Accurate classification of blood cells plays a vital role in hematological analysis as it aids physicians in diagnosing various medical conditions. In this study, we present a novel approach for classifying blood cell images known as BC-SAM. 
arXiv:2408.06716 [pdf, other] cs.CV

Towards Cross-Domain Single Blood Cell Image Classification via Large-Scale LoRA-based Segment Anything Model

Authors: Yongcheng Li, Lingcong Cai, Ying Lu, Yupeng Zhang, Jingyan Jiang, Genan Dai, Bowen Zhang, Jingzhou Cao, Xiangzhong Zhang, Xiaomao Fan

Abstract: Accurate classification of blood cells plays a vital role in hematological analysis, as it aids physicians in diagnosing various medical conditions. In this study, we present a novel approach for classifying blood cell images known as BC-SAM. BC-SAM leverages the large-scale foundation model Segment Anything Model (SAM) and incorporates a fine-tuning technique using LoRA, allowing it to extract general image embeddings from blood cell images. To enhance the applicability of BC-SAM across different blood cell image datasets, we introduce an unsupervised cross-domain autoencoder that focuses on learning intrinsic features while suppressing artifacts in the images. To assess the performance of BC-SAM, we employ four widely used machine learning classifiers (Random Forest, Support Vector Machine, Artificial Neural Network, and XGBoost) to construct blood cell classification models and compare them against existing state-of-the-art methods. Experimental results on two publicly available blood cell datasets (Matek-19 and Acevedo-20) demonstrate that our proposed BC-SAM achieves a new state-of-the-art result, surpassing the baseline methods by a significant margin. The source code of this paper is available at https://github.com/AnoK3111/BC-SAM.

Submitted 13 August, 2024; originally announced August 2024.
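Both DoRL and BC-SAM end with classical classifiers fitted on learned embeddings. A sketch of that final stage with scikit-learn, where random vectors stand in for the LoRA-SAM/autoencoder features (the data is synthetic and purely illustrative):

```python
# Fit classical classifiers on (stand-in) domain-invariant embeddings.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 256))      # stand-in image embeddings
y = rng.integers(0, 5, size=200)     # stand-in blood-cell class labels

for clf in (RandomForestClassifier(), SVC()):
    scores = cross_val_score(clf, X, y, cv=5)
    print(type(clf).__name__, scores.mean())
```

Decoupling representation learning from the classifier this way lets the same embeddings be reused across domains without retraining the foundation model.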
arXiv:2408.05426 [pdf, other] cs.CV

SAM-FNet: SAM-Guided Fusion Network for Laryngo-Pharyngeal Tumor Detection

Authors: Jia Wei, Yun Li, Meiyu Qiu, Hongyu Chen, Xiaomao Fan, Wenbin Lei

Abstract: Laryngo-pharyngeal cancer (LPC) is a highly fatal malignant disease affecting the head and neck region. Previous studies on endoscopic tumor detection, particularly those leveraging dual-branch network architectures, have shown significant advancements in tumor detection. These studies highlight the potential of dual-branch networks to improve diagnostic accuracy by effectively integrating global and local (lesion) feature extraction. However, they are still limited in their ability to accurately locate the lesion region and to capture the discriminative feature information between the global and local branches. To address these issues, we propose a novel SAM-guided fusion network (SAM-FNet), a dual-branch network for laryngo-pharyngeal tumor detection. Leveraging the powerful object segmentation capabilities of the Segment Anything Model (SAM), we introduce SAM into SAM-FNet to accurately segment the lesion region. Furthermore, we propose a GAN-like feature optimization (GFO) module to capture the discriminative features between the global and local branches, enhancing the complementarity of the fused features. Additionally, we collect two LPC datasets from the First Affiliated Hospital (FAHSYSU) and the Sixth Affiliated Hospital (SAHSYSU) of Sun Yat-sen University. The FAHSYSU dataset is used as the internal dataset for training the model, while the SAHSYSU dataset is used as the external dataset for evaluating the model's performance. Extensive experiments on both the FAHSYSU and SAHSYSU datasets demonstrate that SAM-FNet achieves competitive results, outperforming state-of-the-art counterparts. The source code of SAM-FNet is available at https://github.com/VVJia/SAM-FNet.

Submitted 14 August, 2024; v1 submitted 10 August, 2024; originally announced August 2024.
arXiv:2408.04585 [pdf, other] cs.CL

Towards Resilient and Efficient LLMs: A Comparative Study of Efficiency, Performance, and Adversarial Robustness

Authors: Xiaojing Fan, Chunliang Tao

Abstract: With the increasing demand for practical applications of Large Language Models (LLMs), many attention-efficient models have been developed to balance performance and computational cost. However, the adversarial robustness of these models remains underexplored. In this work, we design a framework to investigate the trade-off between efficiency, performance, and adversarial robustness of LLMs, and we conduct extensive experiments on three prominent models with varying levels of complexity and efficiency -- Transformer++, Gated Linear Attention (GLA) Transformer, and MatMul-Free LM -- utilizing the GLUE and AdvGLUE datasets. The AdvGLUE dataset extends the GLUE dataset with adversarial samples designed to challenge model robustness. Our results show that while the GLA Transformer and MatMul-Free LM achieve slightly lower accuracy on GLUE tasks, they demonstrate higher efficiency and either superior or comparable robustness on AdvGLUE tasks relative to Transformer++ across different attack levels. These findings highlight the potential of simplified architectures to achieve a compelling balance between efficiency, performance, and adversarial robustness, offering valuable insights for applications where resource constraints and resilience to adversarial attacks are critical.

Submitted 13 September, 2024; v1 submitted 8 August, 2024; originally announced August 2024.
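The evaluation protocol reduces to scoring the same model on a clean benchmark and its adversarial counterpart and reading the gap as a robustness measure. A schematic harness is sketched below; `model_predict` stands in for any of the three evaluated models, and the example dicts are assumed to carry `text`/`label` fields (our convention, not the paper's):

```python
# Schematic clean-vs-adversarial evaluation harness.
def accuracy(model_predict, examples):
    correct = sum(model_predict(ex["text"]) == ex["label"] for ex in examples)
    return correct / len(examples)

def robustness_report(model_predict, glue_split, advglue_split):
    clean = accuracy(model_predict, glue_split)
    adversarial = accuracy(model_predict, advglue_split)
    return {"clean": clean,
            "adversarial": adversarial,
            "drop": clean - adversarial}   # smaller drop = more robust
```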
arXiv:2408.01950 [cs.SD, cs.CL, eess.AS]
Why Perturbing Symbolic Music is Necessary: Fitting the Distribution of Never-used Notes through a Joint Probabilistic Diffusion Model
Authors: Shipei Liu, Xiaoya Fan, Guowei Wu
Abstract: Existing music generation models are mostly language-based, neglecting the frequency continuity property of notes; as a result they fit rare or never-used notes poorly, which reduces the diversity of generated samples. We argue that the distribution of notes can be modeled by translational invariance and periodicity, especially using diffusion models to generalize notes by injecting frequency-domain Gaussian noise. However, due to the low-density nature of music symbols, estimating the distribution of notes latent in the high-density solution space poses significant challenges. To address this problem, we introduce the Music-Diff architecture, which fits a joint distribution of notes and accompanying semantic information to generate symbolic music conditionally. We first enhance the fragmentation module for extracting semantics by using event-based notations and the structural similarity index, thereby preventing boundary blurring. As a prerequisite for multivariate perturbation, we introduce a joint pre-training method to construct the progressions between notes and musical semantics while avoiding direct modeling of low-density notes. Finally, we recover the perturbed notes with a multi-branch denoiser that fits multiple noise objectives via Pareto optimization. Our experiments suggest that, in contrast to language models, joint probability diffusion models perturbing at both the note and semantic levels provide more sample diversity and compositional regularity. A case study highlights the rhythmic advantages of our model over language- and DDPM-based models by analyzing the hierarchical structure expressed in self-similarity metrics.
Submitted 4 August, 2024; originally announced August 2024.
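The "frequency-domain Gaussian noise" idea above corresponds to the standard forward-diffusion step applied to note frequencies. A minimal sketch, assuming a linear noise schedule and the usual MIDI-to-Hz mapping (both assumptions; Music-Diff's actual parameterization is not shown in the abstract):

```python
import numpy as np

def midi_to_hz(pitch):
    # Standard MIDI-to-frequency mapping (A4 = 69 -> 440 Hz).
    return 440.0 * 2.0 ** ((np.asarray(pitch, float) - 69.0) / 12.0)

def perturb_notes(pitches, t, betas):
    """One forward-diffusion step in the frequency domain:
    q(x_t | x_0) = N(sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I)."""
    alpha_bar = np.cumprod(1.0 - betas)[t]
    x0 = midi_to_hz(pitches)
    noise = np.random.randn(*x0.shape)
    return np.sqrt(alpha_bar) * x0 + np.sqrt(1.0 - alpha_bar) * noise

betas = np.linspace(1e-4, 0.02, 1000)   # assumed linear schedule
melody = [60, 62, 64, 65, 67]           # C major fragment
print(perturb_notes(melody, t=100, betas=betas))
```

Because frequency is continuous, noise injected here can land between discrete pitches, which is what lets the model generalize to rarely used notes.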
arXiv:2408.00762 [cs.CV]
UniTalker: Scaling up Audio-Driven 3D Facial Animation through A Unified Model
Authors: Xiangyu Fan, Jiaqi Li, Zhiqian Lin, Weiye Xiao, Lei Yang
Abstract: Audio-driven 3D facial animation aims to map input audio to realistic facial motion. Despite significant progress, limitations arise from inconsistent 3D annotations, restricting previous models to training on specific annotations and thereby constraining the training scale. In this work, we present UniTalker, a unified model featuring a multi-head architecture designed to effectively leverage datasets with varied annotations. To enhance training stability and ensure consistency among multi-head outputs, we employ three training strategies, namely PCA, model warm-up, and pivot identity embedding. To expand the training scale and diversity, we assemble A2F-Bench, comprising five publicly available datasets and three newly curated datasets. These datasets cover a wide range of audio domains, including multilingual speech and songs, scaling the training data from the commonly employed datasets, typically less than 1 hour, to 18.5 hours. With a single trained UniTalker model, we achieve substantial lip vertex error reductions of 9.2% for the BIWI dataset and 13.7% for Vocaset. Additionally, the pre-trained UniTalker shows promise as a foundation model for audio-driven facial animation tasks. Fine-tuning the pre-trained UniTalker on seen datasets further enhances performance on each dataset, with an average error reduction of 6.3% on A2F-Bench. Moreover, fine-tuning UniTalker on an unseen dataset with only half the data surpasses prior state-of-the-art models trained on the full dataset. The code and dataset are available at https://github.com/X-niper/UniTalker.
Submitted 1 August, 2024; originally announced August 2024.
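The multi-head idea is the structural core of the abstract: one shared encoder, one output head per annotation convention. A minimal sketch of that shape (all dimensions, names, and the tanh encoder are illustrative assumptions, not UniTalker's architecture):

```python
import numpy as np

rng = np.random.default_rng(0)

class MultiHeadA2F:
    """Shared audio encoder with one output head per dataset, so
    corpora with different vertex annotations can train jointly."""

    def __init__(self, d_audio=128, d_hidden=64, head_dims=None):
        self.enc = rng.normal(size=(d_audio, d_hidden)) * 0.01
        # One decoder head per dataset/annotation format; in practice
        # each head would predict 3 coordinates per mesh vertex.
        self.heads = {name: rng.normal(size=(d_hidden, d_out)) * 0.01
                      for name, d_out in (head_dims or {}).items()}

    def forward(self, audio_feat, dataset):
        h = np.tanh(audio_feat @ self.enc)   # shared representation
        return h @ self.heads[dataset]       # dataset-specific motion

model = MultiHeadA2F(head_dims={"BIWI": 210, "vocaset": 150})
audio = rng.normal(size=(1, 128))
print(model.forward(audio, "BIWI").shape)    # (1, 210)
```

The training strategies named in the abstract (PCA, warm-up, pivot identity embedding) would sit on top of this skeleton to keep the per-head outputs consistent.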
arXiv:2407.19893 [cs.AI, cs.HC]
Leveraging Foundation Models for Zero-Shot IoT Sensing
Authors: Dinghao Xue, Xiaoran Fan, Tao Chen, Guohao Lan, Qun Song
Abstract: Deep learning models are increasingly deployed on edge Internet of Things (IoT) devices. However, these models typically operate under supervised conditions and fail to recognize unseen classes that differ from those seen in training. To address this, zero-shot learning (ZSL) aims to classify data of unseen classes with the help of semantic information. Foundation models (FMs) trained on web-scale data have shown impressive ZSL capability in natural language processing and visual understanding. However, leveraging FMs' generalized knowledge for zero-shot IoT sensing using signals such as mmWave, IMU, and Wi-Fi has not been fully investigated. In this work, we align the IoT data embeddings with the semantic embeddings generated by an FM's text encoder for zero-shot IoT sensing. To utilize the physics principles governing the generation of IoT sensor signals and derive more effective prompts for semantic embedding extraction, we propose using cross-attention to combine a learnable soft prompt, optimized automatically on training data, with an auxiliary hard prompt that encodes domain knowledge of the IoT sensing task. To address the problem of IoT embeddings being biased toward seen classes due to the lack of unseen-class data during training, we propose using data augmentation to synthesize unseen-class IoT data for fine-tuning the IoT feature extractor and embedding projector. We evaluate our approach on multiple IoT sensing tasks. Results show that it achieves superior open-set detection and generalized zero-shot learning performance compared with various baselines. Our code is available at https://github.com/schrodingho/FM_ZSL_IoT.
Submitted 29 July, 2024; originally announced July 2024.
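At inference time, embedding-alignment ZSL of this kind reduces to nearest-neighbor matching between a signal embedding and per-class text embeddings. A minimal sketch under that assumption (the class names and random stand-in embeddings are illustrative, not from the paper):

```python
import numpy as np

def cosine(a, b):
    a = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return a @ b.T

def zero_shot_classify(iot_embedding, class_text_embeddings, names):
    """Assign the class whose text embedding (from an FM text encoder)
    is closest to the projected IoT signal embedding."""
    sims = cosine(iot_embedding[None, :], class_text_embeddings)[0]
    return names[int(np.argmax(sims))], sims

rng = np.random.default_rng(1)
names = ["walking", "falling", "sitting"]        # hypothetical classes
text_emb = rng.normal(size=(3, 16))              # stand-in FM embeddings
signal_emb = text_emb[1] + 0.1 * rng.normal(size=16)  # near 'falling'
print(zero_shot_classify(signal_emb, text_emb, names)[0])  # 'falling'
```

Unseen classes then require no labeled sensor data, only their names or descriptions run through the text encoder.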
arXiv:2407.19491 [cs.CV]
Multi-modal Crowd Counting via Modal Emulation
Authors: Chenhao Wang, Xiaopeng Hong, Zhiheng Ma, Yupeng Wei, Yabin Wang, Xiaopeng Fan
Abstract: Multi-modal crowd counting is a crucial task that uses multi-modal cues to estimate the number of people in crowded scenes. To overcome the gap between different modalities, we propose a modal emulation-based two-pass multi-modal crowd-counting framework that enables efficient modal emulation, alignment, and fusion. The framework consists of two key components: a multi-modal inference pass and a cross-modal emulation pass. The former utilizes a hybrid cross-modal attention module to extract global and local information and achieve efficient multi-modal fusion. The latter uses attention prompting to coordinate different modalities and enhance multi-modal alignment. We also introduce a modality alignment module that uses an efficient modal consistency loss to align the outputs of the two passes and bridge the semantic gap between modalities. Extensive experiments on both RGB-Thermal and RGB-Depth counting datasets demonstrate its superior performance compared to previous methods. Code is available at https://github.com/Mr-Monday/Multi-modal-Crowd-Counting-via-Modal-Emulation.
Submitted 28 July, 2024; originally announced July 2024.
Comments: This is the preprint version of the paper to appear in BMVC 2024. Please cite the final published version.
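The "modal consistency loss" that ties the two passes together is, in its simplest generic form, a penalty on the disagreement between their outputs. A toy sketch under that assumption (the paper's actual loss may differ; the density maps here are random stand-ins):

```python
import numpy as np

def modal_consistency_loss(pred_multimodal, pred_emulated):
    """Mean-squared penalty tying the cross-modal emulation pass to
    the multi-modal inference pass (a generic stand-in for the
    paper's modal consistency loss)."""
    return float(np.mean((pred_multimodal - pred_emulated) ** 2))

# Toy density maps from the two passes over an 8x8 grid.
rng = np.random.default_rng(2)
density_mm = rng.random((8, 8))
density_em = density_mm + 0.05 * rng.normal(size=(8, 8))
print(round(modal_consistency_loss(density_mm, density_em), 4))
```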
arXiv:2407.16161 [cs.LG]
TransFeat-TPP: An Interpretable Deep Covariate Temporal Point Processes
Authors: Zizhuo Meng, Boyu Li, Xuhui Fan, Zhidong Li, Yang Wang, Fang Chen, Feng Zhou
Abstract: The classical temporal point process (TPP) constructs an intensity function by taking the occurrence times into account. Nevertheless, occurrence time may not be the only relevant factor; other contextual data, termed covariates, may also impact the event evolution. Incorporating such covariates into the model is beneficial, while distinguishing their relevance to the event dynamics is of great practical significance. In this work, we propose a Transformer-based covariate temporal point process (TransFeat-TPP) model to improve the interpretability of deep covariate-TPPs while maintaining powerful expressiveness. TransFeat-TPP can effectively model complex relationships between events and covariates, and provides enhanced interpretability by discerning the importance of various covariates. Experimental results on synthetic and real datasets demonstrate improved prediction accuracy and consistently interpretable feature importance compared to existing deep covariate-TPPs.
Submitted 23 July, 2024; originally announced July 2024.
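To anchor the terminology: a TPP is specified by an intensity function λ(t), and its log-likelihood combines log-intensities at observed events with the integrated intensity (the compensator). A toy covariate-modulated example follows; the softplus-plus-exponential-decay form is illustrative, not TransFeat-TPP itself:

```python
import numpy as np

def intensity(t, history, covariates, w, mu):
    """Toy covariate-modulated Hawkes-style intensity:
    lambda(t) = softplus(mu + w . x) + sum_i exp(-(t - t_i))."""
    base = np.logaddexp(0.0, mu + covariates @ w)  # softplus, stays > 0
    excitation = sum(np.exp(-(t - ti)) for ti in history if ti < t)
    return base + excitation

def neg_log_likelihood(events, T, covariates, w, mu, grid=200):
    """Standard TPP NLL: -sum_i log lambda(t_i) + int_0^T lambda(t) dt,
    with the compensator approximated on a fixed grid."""
    ll = sum(np.log(intensity(t, events, covariates, w, mu)) for t in events)
    ts = np.linspace(0.0, T, grid)
    ll -= np.trapz([intensity(t, events, covariates, w, mu) for t in ts], ts)
    return -ll

events, T = [0.7, 1.9, 2.3], 5.0
x = np.array([0.5, -1.0])                 # per-sequence covariates
print(neg_log_likelihood(events, T, x, w=np.array([0.3, 0.1]), mu=0.2))
```

TransFeat-TPP's contribution is to learn such event-covariate relationships with a Transformer while exposing which covariates matter.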
arXiv:2407.15867 [cs.DL, cs.SI]
The new science of COVID-19: A Bibliographic and Network Analysis
Authors: Xuezhou Fan
Abstract: Since the outbreak of COVID-19, there have been many scientific publications studying the disease. The purpose of this study is to identify research trends, collaboration patterns, the most influential elements, and related properties of scientific publications on COVID-19 in 2020, using bibliographic analysis and network analysis. Chapters 1 and 2 introduce the motivation behind this paper and discuss previous similar studies, comparing them with this work in aspects such as data collection methods and data analysis tools, and addressing their relative advantages and limitations. Chapter 3 introduces the key concepts used in this paper: for bibliographic analysis, the h-index and g-index; for network analysis, centrality measures and assortativity. Networks with the small-world and scale-free properties are also studied. Chapters 4 and 5 describe, step by step, how the data were obtained and present the full results. Chapter 6 draws conclusions: a general growing trend in the number of publications is observed, reflecting the efforts of scientific researchers, and measures should be taken to encourage future study in this field.
Submitted 17 July, 2024; originally announced July 2024.
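The two bibliometric indices named in the abstract have simple definitions worth stating: the h-index is the largest h such that h papers each have at least h citations, and the g-index is the largest g such that the top g papers together have at least g² citations. A minimal implementation:

```python
def h_index(citations):
    """Largest h such that h papers have at least h citations each."""
    cs = sorted(citations, reverse=True)
    return sum(1 for i, c in enumerate(cs, start=1) if c >= i)

def g_index(citations):
    """Largest g such that the top g papers together have at least
    g squared citations."""
    cs = sorted(citations, reverse=True)
    total, g = 0, 0
    for i, c in enumerate(cs, start=1):
        total += c
        if total >= i * i:
            g = i
    return g

papers = [10, 8, 5, 4, 3]
print(h_index(papers), g_index(papers))   # 4 5
```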
arXiv:2407.08130 [cs.MM, cs.CV, cs.SD, eess.AS]
Spiking Tucker Fusion Transformer for Audio-Visual Zero-Shot Learning
Authors: Wenrui Li, Penghong Wang, Ruiqin Xiong, Xiaopeng Fan
Abstract: Spiking neural networks (SNNs), which efficiently encode temporal sequences, have shown great potential in extracting audio-visual joint feature representations. However, coupling SNNs (binary spike sequences) with transformers (floating-point sequences) to jointly explore temporal-semantic information still faces challenges. In this paper, we introduce a novel Spiking Tucker Fusion Transformer (STFT) for audio-visual zero-shot learning (ZSL). The STFT leverages temporal and semantic information from different time steps to generate robust representations. A time-step factor (TSF) is introduced to dynamically synthesize the subsequent inference information. To guide the formation of input membrane potentials and reduce spike noise, we propose a global-local pooling (GLP) module that combines max and average pooling operations. Furthermore, the thresholds of the spiking neurons are dynamically adjusted based on semantic and temporal cues. Integrating the temporal and semantic information extracted by SNNs and Transformers is difficult due to the increased number of parameters in a straightforward bilinear model. To address this, we introduce a temporal-semantic Tucker fusion module, which achieves multi-scale fusion of SNN and Transformer outputs while maintaining full second-order interactions. Our experimental results demonstrate the effectiveness of the proposed approach, achieving state-of-the-art performance on three benchmark datasets, with harmonic mean (HM) improvements on VGGSound, UCF101, and ActivityNet of around 15.4%, 3.9%, and 14.9%, respectively.
Submitted 10 July, 2024; originally announced July 2024.
Comments: Accepted by TIP
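The point of Tucker fusion over a plain bilinear model is parameter count: a full bilinear map needs d_s × d_t × d_out weights, while Tucker factorizes it into small projections and a low-rank core tensor yet keeps second-order interactions. A generic sketch (dimensions and projections are illustrative, not the STFT module itself):

```python
import numpy as np

def tucker_fusion(snn_feat, trans_feat, core, W_s, W_t, W_o):
    """Tucker-style bilinear fusion: project each modality, contract
    with a low-rank core tensor, then project to the output space."""
    s = snn_feat @ W_s                  # (batch, r_s)
    t = trans_feat @ W_t                # (batch, r_t)
    fused = np.einsum("bi,ijk,bj->bk", s, core, t)
    return fused @ W_o

rng = np.random.default_rng(3)
B, d_s, d_t, r_s, r_t, r_o, d_out = 2, 32, 64, 8, 8, 8, 10
out = tucker_fusion(
    rng.normal(size=(B, d_s)), rng.normal(size=(B, d_t)),
    core=rng.normal(size=(r_s, r_t, r_o)),
    W_s=rng.normal(size=(d_s, r_s)), W_t=rng.normal(size=(d_t, r_t)),
    W_o=rng.normal(size=(r_o, d_out)))
print(out.shape)                        # (2, 10)
```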
arXiv:2407.07311 [cs.LG, cs.AI, cs.CV]
ViTime: A Visual Intelligence-Based Foundation Model for Time Series Forecasting
Authors: Luoxiao Yang, Yun Wang, Xinqi Fan, Israel Cohen, Jingdong Chen, Yue Zhao, Zijun Zhang
Abstract: The success of large pretrained models in natural language processing (NLP) and computer vision (CV) has opened new avenues for constructing foundation models for time series forecasting (TSF). Traditional TSF foundation models rely heavily on numerical data fitting. In contrast, the human brain is inherently skilled at processing visual information and prefers predicting future trends by observing visualized sequences. From a biomimetic perspective, using models to directly process numerical sequences might not be the most effective route to achieving Artificial General Intelligence (AGI). This paper proposes ViTime, a novel visual-intelligence-based foundation model for TSF. ViTime overcomes the limitations of numerical time series data fitting by utilizing visual data processing paradigms, and employs an innovative data synthesis method during training, called Real Time Series (RealTS). Experiments on a diverse set of previously unseen forecasting datasets demonstrate that ViTime achieves state-of-the-art zero-shot performance, even surpassing the best individually trained supervised models in some situations. These findings suggest that visual intelligence can significantly enhance time series analysis and forecasting, paving the way for more advanced and versatile models in the field. The code for our framework is accessible at https://github.com/IkeYang/ViTime.
Submitted 14 August, 2024; v1 submitted 9 July, 2024; originally announced July 2024.
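The prerequisite for any "look at the series" approach is rasterizing the numeric sequence into an image a vision model can consume. A simplified stand-in for such a visual representation (ViTime's actual rendering is not specified in the abstract):

```python
import numpy as np

def series_to_image(series, height=32):
    """Rasterize a 1-D series into a binary image, one column per
    time step, so a vision model can 'look at' the sequence."""
    s = np.asarray(series, dtype=float)
    s = (s - s.min()) / (s.max() - s.min() + 1e-9)  # normalize to [0, 1]
    rows = ((1.0 - s) * (height - 1)).round().astype(int)
    img = np.zeros((height, len(s)), dtype=np.uint8)
    img[rows, np.arange(len(s))] = 1                # mark the curve
    return img

t = np.linspace(0, 4 * np.pi, 64)
print(series_to_image(np.sin(t)).shape)             # (32, 64)
```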
arXiv:2407.01886 [cs.LG, cs.AI]
Core Knowledge Learning Framework for Graph Adaptation and Scalability Learning
Authors: Bowen Zhang, Zhichao Huang, Genan Dai, Guangning Xu, Xiaomao Fan, Hu Huang
Abstract: Graph classification is a pivotal challenge in machine learning, especially within the realm of graph-based data, given its importance in numerous real-world applications such as social network analysis, recommendation systems, and bioinformatics. Despite its significance, graph classification faces several hurdles, including adapting to diverse prediction tasks, training across multiple target domains, and handling small-sample prediction scenarios. Current methods often tackle these challenges individually, leading to fragmented solutions that lack a holistic approach to the overarching problem. In this paper, we propose an algorithm aimed at addressing these challenges. By incorporating insights from various types of tasks, our method aims to enhance adaptability, scalability, and generalizability in graph classification. Motivated by the recognition that the underlying subgraph plays a crucial role in GNN prediction while the remainder is task-irrelevant, we introduce the Core Knowledge Learning (CKL) framework for graph adaptation and scalability learning. CKL comprises several key modules, including a core subgraph knowledge submodule, a graph domain adaptation module, and a few-shot learning module for downstream tasks. Each module is tailored to tackle specific challenges in graph classification, such as domain shift, label inconsistencies, and data scarcity. By learning the core subgraph of the entire graph, we focus on the features most pertinent to the task. Consequently, our method offers benefits such as improved model performance, increased domain adaptability, and enhanced robustness to domain variations. Experimental results demonstrate significant performance enhancements achieved by our method compared to state-of-the-art approaches.
Submitted 1 July, 2024; originally announced July 2024.
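The core-subgraph premise can be illustrated with a deliberately crude heuristic: keep the most "important" nodes and the edges among them. The framework learns this relevance end to end; the degree-based selection below is only a stand-in to show the subgraph-extraction shape:

```python
from collections import Counter

def core_subgraph(edges, k):
    """Keep the k highest-degree nodes and the edges among them --
    a crude stand-in for learning the task-relevant core subgraph."""
    deg = Counter()
    for u, v in edges:
        deg[u] += 1
        deg[v] += 1
    core = {n for n, _ in deg.most_common(k)}
    return core, [(u, v) for u, v in edges if u in core and v in core]

edges = [(0, 1), (0, 2), (0, 3), (1, 2), (3, 4), (4, 5)]
print(core_subgraph(edges, k=3))
```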
arXiv:2406.19042 [cs.CR, cs.DC]
Towards Credential-based Device Registration in DApps for DePINs with ZKPs
Authors: Jonathan Heiss, Fernando Castillo, Xinxin Fan
Abstract: Decentralized Physical Infrastructure Networks (DePINs) are secured and governed by blockchains, but beyond crypto-economic incentives, they lack measures to establish trust in participating devices and their services. The verification of relevant device credentials during device registration helps to overcome this problem. However, on-chain verification in decentralized applications (dApps) discloses potentially confidential device attributes, whereas off-chain verification introduces undesirable trust assumptions. In this paper, we propose a credential-based device registration (CDR) mechanism that verifies device credentials on the blockchain and leverages zero-knowledge proofs (ZKPs) to protect confidential device attributes from being disclosed. We characterize CDR for DePINs, present a general system model, and technically evaluate CDR using zkSNARKs with Groth16 and Marlin. Our experiments give first insights into performance impacts and reveal a tradeoff between the applied proof systems.
Submitted 27 June, 2024; originally announced June 2024.
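A full zkSNARK is beyond a short snippet, so the sketch below swaps in a plain hash commitment to show only the interface shape: confidential attributes stay off-chain, a digest goes on-chain, and a verifier checks claims against it. In the paper's CDR design, a Groth16 or Marlin proof replaces the reveal-and-recompute step so attributes are never disclosed; everything here is an illustrative stand-in.

```python
import hashlib, secrets

def commit(attributes: bytes, nonce: bytes) -> str:
    """Hash commitment to confidential device attributes; only the
    digest would be stored on-chain."""
    return hashlib.sha256(nonce + attributes).hexdigest()

def verify_opening(commitment: str, attributes: bytes, nonce: bytes) -> bool:
    """Opening check a verifier could run. A real ZKP would prove
    statements about `attributes` without revealing them; this
    hash commitment only demonstrates the API shape."""
    return commit(attributes, nonce) == commitment

nonce = secrets.token_bytes(16)
c = commit(b"vendor=acme;model=sensor-v2", nonce)
print(verify_opening(c, b"vendor=acme;model=sensor-v2", nonce))  # True
```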
arXiv:2406.02064 [cs.LG, cs.CR, cs.CV]
Advancing Generalized Transfer Attack with Initialization Derived Bilevel Optimization and Dynamic Sequence Truncation
Authors: Yaohua Liu, Jiaxin Gao, Xuan Liu, Xianghao Jiao, Xin Fan, Risheng Liu
Abstract: Transfer attacks generate significant interest for real-world black-box applications by crafting transferable adversarial examples through surrogate models. However, existing works essentially optimize a single-level objective w.r.t. the surrogate model directly, which often leads to poor interpretability of the attack mechanism and limited generalization over unknown victim models. In this work, we propose the BilEvel Transfer AttacK (BETAK) framework, which establishes an initialization-derived bilevel optimization paradigm that explicitly reformulates the nested constraint relationship between the Upper-Level (UL) pseudo-victim attacker and the Lower-Level (LL) surrogate attacker. Algorithmically, we introduce Hyper Gradient Response (HGR) estimation as effective feedback on transferability over pseudo-victim attackers, and propose the Dynamic Sequence Truncation (DST) technique to dynamically adjust the back-propagation path for HGR while reducing computational overhead. Meanwhile, we conduct detailed algorithmic analysis and provide a convergence guarantee to support the non-convexity of the LL surrogate attacker. Extensive evaluations demonstrate substantial improvement of BETAK (e.g., a 53.41% increase in attack success rates against IncRes-v2_ens) against different victims and defense methods in targeted and untargeted attack scenarios. The source code is available at https://github.com/callous-youth/BETAK.
Submitted 4 June, 2024; originally announced June 2024.
Comments: Accepted by IJCAI 2024. 10 pages
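BETAK's bilevel machinery is too involved for a snippet, but the transfer-attack setting it builds on is easy to show: craft a perturbation from a surrogate model's gradient, then test it on a held-out "victim" the attacker never differentiates through. A toy single-step (FGSM-style) sketch with linear logistic models standing in for both networks:

```python
import numpy as np

def fgsm(x, grad, eps):
    """Single-step sign attack along the surrogate's input gradient."""
    return x + eps * np.sign(grad)

def logistic_grad_x(w, b, x, y):
    """d(loss)/dx of logistic loss for a linear 'model' (toy surrogate)."""
    p = 1.0 / (1.0 + np.exp(-(x @ w + b)))
    return (p - y) * w

rng = np.random.default_rng(4)
x, y = rng.normal(size=5), 1.0
w_surrogate = rng.normal(size=5)                    # model we can differentiate
w_victim = w_surrogate + 0.3 * rng.normal(size=5)   # unseen victim model

x_adv = fgsm(x, logistic_grad_x(w_surrogate, 0.0, x, y), eps=0.3)
for name, w in [("surrogate", w_surrogate), ("victim", w_victim)]:
    print(name, "score clean/adv:",
          round(float(x @ w), 3), round(float(x_adv @ w), 3))
```

The single-level approach criticized in the abstract stops at the surrogate gradient; BETAK instead treats transferability to pseudo-victims as an upper-level objective that shapes the attack's initialization.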
filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10