Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 553 results for author: <span class="mathjax">Tan, C</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Tan%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Tan, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tan%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tan, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Tan%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18429">arXiv:2411.18429</a> <span> [<a href="https://arxiv.org/pdf/2411.18429">pdf</a>, <a href="https://arxiv.org/format/2411.18429">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> An AI-Assisted Multi-Agent Dual Dialogue System to Support Mental Health Care Providers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kampman%2C+O+P">Onno P. Kampman</a>, <a href="/search/cs?searchtype=author&query=Phang%2C+Y+S">Ye Sheng Phang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Stanley Han</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+M">Michael Xing</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+X">Xinyi Hong</a>, <a href="/search/cs?searchtype=author&query=Hoosainsah%2C+H">Hazirah Hoosainsah</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Caleb Tan</a>, <a href="/search/cs?searchtype=author&query=Winata%2C+G+I">Genta Indra Winata</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Skyler Wang</a>, <a href="/search/cs?searchtype=author&query=Heaukulani%2C+C">Creighton Heaukulani</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+J+H">Janice Huiqin Weng</a>, <a href="/search/cs?searchtype=author&query=Morris%2C+R+J">Robert JT Morris</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18429v1-abstract-short" style="display: inline;"> We introduce a general-purpose, human-in-the-loop dual dialogue system to support mental health care professionals. The system, co-designed with care providers, is conceptualized to assist them in interacting with care seekers rather than functioning as a fully automated dialogue system solution. The AI assistant within the system reduces the cognitive load of mental health care providers by propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18429v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18429v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18429v1-abstract-full" style="display: none;"> We introduce a general-purpose, human-in-the-loop dual dialogue system to support mental health care professionals. The system, co-designed with care providers, is conceptualized to assist them in interacting with care seekers rather than functioning as a fully automated dialogue system solution. The AI assistant within the system reduces the cognitive load of mental health care providers by proposing responses, analyzing conversations to extract pertinent themes, summarizing dialogues, and recommending localized relevant content and internet-based cognitive behavioral therapy exercises. These functionalities are achieved through a multi-agent system design, where each specialized, supportive agent is characterized by a large language model. In evaluating the multi-agent system, we focused specifically on the proposal of responses to emotionally distressed care seekers. We found that the proposed responses matched a reasonable human quality in demonstrating empathy, showing its appropriateness for augmenting the work of mental health care providers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18429v1-abstract-full').style.display = 'none'; document.getElementById('2411.18429v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18314">arXiv:2411.18314</a> <span> [<a href="https://arxiv.org/pdf/2411.18314">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-time Video Target Tracking Algorithm Utilizing Convolutional Neural Networks (CNN) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaoyi Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtian Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaobo Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Z">Zhen Qi</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+A">Ao Xiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18314v1-abstract-short" style="display: inline;"> Thispaperaimstoresearchandimplementa real-timevideotargettrackingalgorithmbasedon ConvolutionalNeuralNetworks(CNN),enhancingthe accuracyandrobustnessoftargettrackingincomplex scenarios.Addressingthelimitationsoftraditionaltracking algorithmsinhandlingissuessuchastargetocclusion,morphologicalchanges,andbackgroundinterference,our approachintegratestargetdetectionandtrackingstrategies.It continuously… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18314v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18314v1-abstract-full" style="display: none;"> Thispaperaimstoresearchandimplementa real-timevideotargettrackingalgorithmbasedon ConvolutionalNeuralNetworks(CNN),enhancingthe accuracyandrobustnessoftargettrackingincomplex scenarios.Addressingthelimitationsoftraditionaltracking algorithmsinhandlingissuessuchastargetocclusion,morphologicalchanges,andbackgroundinterference,our approachintegratestargetdetectionandtrackingstrategies.It continuouslyupdatesthetargetmodelthroughanonline learningmechanismtoadapttochangesinthetarget's appearance.Experimentalresultsdemonstratethat,when dealingwithsituationsinvolvingrapidmotion,partial occlusion,andcomplexbackgrounds,theproposedalgorithm exhibitshighertrackingsuccessratesandlowerfailurerates comparedtoseveralmainstreamtrackingalgorithms.This studysuccessfullyappliesCNNtoreal-timevideotarget tracking,improvingtheaccuracyandstabilityofthetracking algorithmwhilemaintaininghighprocessingspeeds,thus meetingthedemandsofreal-timeapplications.Thisalgorithm isexpectedtoprovidenewsolutionsfortargettrackingtasksin videosurveillanceandintelligenttransportationdomains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18314v1-abstract-full').style.display = 'none'; document.getElementById('2411.18314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13754">arXiv:2411.13754</a> <span> [<a href="https://arxiv.org/pdf/2411.13754">pdf</a>, <a href="https://arxiv.org/format/2411.13754">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning to Reason Iteratively and Parallelly for Complex Visual Reasoning Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jaiswal%2C+S">Shantanu Jaiswal</a>, <a href="/search/cs?searchtype=author&query=Roy%2C+D">Debaditya Roy</a>, <a href="/search/cs?searchtype=author&query=Fernando%2C+B">Basura Fernando</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheston Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13754v1-abstract-short" style="display: inline;"> Complex visual reasoning and question answering (VQA) is a challenging task that requires compositional multi-step processing and higher-level reasoning capabilities beyond the immediate recognition and localization of objects and events. Here, we introduce a fully neural Iterative and Parallel Reasoning Mechanism (IPRM) that combines two distinct forms of computation -- iterative and parallel --… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13754v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13754v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13754v1-abstract-full" style="display: none;"> Complex visual reasoning and question answering (VQA) is a challenging task that requires compositional multi-step processing and higher-level reasoning capabilities beyond the immediate recognition and localization of objects and events. Here, we introduce a fully neural Iterative and Parallel Reasoning Mechanism (IPRM) that combines two distinct forms of computation -- iterative and parallel -- to better address complex VQA scenarios. Specifically, IPRM's "iterative" computation facilitates compositional step-by-step reasoning for scenarios wherein individual operations need to be computed, stored, and recalled dynamically (e.g. when computing the query "determine the color of pen to the left of the child in red t-shirt sitting at the white table"). Meanwhile, its "parallel" computation allows for the simultaneous exploration of different reasoning paths and benefits more robust and efficient execution of operations that are mutually independent (e.g. when counting individual colors for the query: "determine the maximum occurring color amongst all t-shirts"). We design IPRM as a lightweight and fully-differentiable neural module that can be conveniently applied to both transformer and non-transformer vision-language backbones. It notably outperforms prior task-specific methods and transformer-based attention modules across various image and video VQA benchmarks testing distinct complex reasoning capabilities such as compositional spatiotemporal reasoning (AGQA), situational reasoning (STAR), multi-hop reasoning generalization (CLEVR-Humans) and causal event linking (CLEVRER-Humans). Further, IPRM's internal computations can be visualized across reasoning steps, aiding interpretability and diagnosis of its errors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13754v1-abstract-full').style.display = 'none'; document.getElementById('2411.13754v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 camera ready; source code to be released at: https://github.com/shantanuj/IPRM_Iterative_and_Parallel_Reasoning_Mechanism</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11916">arXiv:2411.11916</a> <span> [<a href="https://arxiv.org/pdf/2411.11916">pdf</a>, <a href="https://arxiv.org/format/2411.11916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> From Words to Structured Visuals: A Benchmark and Framework for Text-to-Diagram Generation and Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jingxuan Wei</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Gaowei Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangyang Gao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Linzhuang Sun</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bihui Yu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+R">Ruifeng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11916v1-abstract-short" style="display: inline;"> We introduce the task of text-to-diagram generation, which focuses on creating structured visual representations directly from textual descriptions. Existing approaches in text-to-image and text-to-code generation lack the logical organization and flexibility needed to produce accurate, editable diagrams, often resulting in outputs that are either unstructured or difficult to modify. To address th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11916v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11916v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11916v1-abstract-full" style="display: none;"> We introduce the task of text-to-diagram generation, which focuses on creating structured visual representations directly from textual descriptions. Existing approaches in text-to-image and text-to-code generation lack the logical organization and flexibility needed to produce accurate, editable diagrams, often resulting in outputs that are either unstructured or difficult to modify. To address this gap, we introduce DiagramGenBenchmark, a comprehensive evaluation framework encompassing eight distinct diagram categories, including flowcharts, model architecture diagrams, and mind maps. Additionally, we present DiagramAgent, an innovative framework with four core modules-Plan Agent, Code Agent, Check Agent, and Diagram-to-Code Agent-designed to facilitate both the generation and refinement of complex diagrams. Our extensive experiments, which combine objective metrics with human evaluations, demonstrate that DiagramAgent significantly outperforms existing baseline models in terms of accuracy, structural coherence, and modifiability. This work not only establishes a foundational benchmark for the text-to-diagram generation task but also introduces a powerful toolset to advance research and applications in this emerging area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11916v1-abstract-full').style.display = 'none'; document.getElementById('2411.11916v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11044">arXiv:2411.11044</a> <span> [<a href="https://arxiv.org/pdf/2411.11044">pdf</a>, <a href="https://arxiv.org/ps/2411.11044">ps</a>, <a href="https://arxiv.org/format/2411.11044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Efficient Federated Unlearning with Adaptive Differential Privacy Preservation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu Jiang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+X">Xindi Tong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziyao Liu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Huanyi Ye</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kwok-Yan Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11044v1-abstract-short" style="display: inline;"> Federated unlearning (FU) offers a promising solution to effectively address the need to erase the impact of specific clients' data on the global model in federated learning (FL), thereby granting individuals the ``Right to be Forgotten". The most straightforward approach to achieve unlearning is to train the model from scratch, excluding clients who request data removal, but it is resource-intens… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11044v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11044v1-abstract-full" style="display: none;"> Federated unlearning (FU) offers a promising solution to effectively address the need to erase the impact of specific clients' data on the global model in federated learning (FL), thereby granting individuals the ``Right to be Forgotten". The most straightforward approach to achieve unlearning is to train the model from scratch, excluding clients who request data removal, but it is resource-intensive. Current state-of-the-art FU methods extend traditional FL frameworks by leveraging stored historical updates, enabling more efficient unlearning than training from scratch. However, the use of stored updates introduces significant privacy risks. Adversaries with access to these updates can potentially reconstruct clients' local data, a well-known vulnerability in the privacy domain. While privacy-enhanced techniques exist, their applications to FU scenarios that balance unlearning efficiency with privacy protection remain underexplored. To address this gap, we propose FedADP, a method designed to achieve both efficiency and privacy preservation in FU. Our approach incorporates an adaptive differential privacy (DP) mechanism, carefully balancing privacy and unlearning performance through a novel budget allocation strategy tailored for FU. FedADP also employs a dual-layered selection process, focusing on global models with significant changes and client updates closely aligned with the global model, reducing storage and communication costs. Additionally, a novel calibration method is introduced to facilitate effective unlearning. Extensive experimental results demonstrate that FedADP effectively manages the trade-off between unlearning efficiency and privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11044v1-abstract-full').style.display = 'none'; document.getElementById('2411.11044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11039">arXiv:2411.11039</a> <span> [<a href="https://arxiv.org/pdf/2411.11039">pdf</a>, <a href="https://arxiv.org/format/2411.11039">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedUHB: Accelerating Federated Unlearning via Polyak Heavy Ball Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu Jiang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kwok-Yan Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11039v1-abstract-short" style="display: inline;"> Federated learning facilitates collaborative machine learning, enabling multiple participants to collectively develop a shared model while preserving the privacy of individual data. The growing importance of the "right to be forgotten" calls for effective mechanisms to facilitate data removal upon request. In response, federated unlearning (FU) has been developed to efficiently eliminate the influ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11039v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11039v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11039v1-abstract-full" style="display: none;"> Federated learning facilitates collaborative machine learning, enabling multiple participants to collectively develop a shared model while preserving the privacy of individual data. The growing importance of the "right to be forgotten" calls for effective mechanisms to facilitate data removal upon request. In response, federated unlearning (FU) has been developed to efficiently eliminate the influence of specific data from the model. Current FU methods primarily rely on approximate unlearning strategies, which seek to balance data removal efficacy with computational and communication costs, but often fail to completely erase data influence. To address these limitations, we propose FedUHB, a novel exact unlearning approach that leverages the Polyak heavy ball optimization technique, a first-order method, to achieve rapid retraining. In addition, we introduce a dynamic stopping mechanism to optimize the termination of the unlearning process. Our extensive experiments show that FedUHB not only enhances unlearning efficiency but also preserves robust model performance after unlearning. Furthermore, the dynamic stopping mechanism effectively reduces the number of unlearning iterations, conserving both computational and communication resources. FedUHB can be proved as an effective and efficient solution for exact data removal in federated learning settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11039v1-abstract-full').style.display = 'none'; document.getElementById('2411.11039v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07664">arXiv:2411.07664</a> <span> [<a href="https://arxiv.org/pdf/2411.07664">pdf</a>, <a href="https://arxiv.org/format/2411.07664">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Generation of Spatial Relations in Text and Image Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sim%2C+S+H">Shang Hong Sim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+C">Clarence Lee</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+A">Alvin Tan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheston Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07664v1-abstract-short" style="display: inline;"> Understanding spatial relations is a crucial cognitive ability for both humans and AI. While current research has predominantly focused on the benchmarking of text-to-image (T2I) models, we propose a more comprehensive evaluation that includes \textit{both} T2I and Large Language Models (LLMs). As spatial relations are naturally understood in a visuo-spatial manner, we develop an approach to conve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07664v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07664v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07664v1-abstract-full" style="display: none;"> Understanding spatial relations is a crucial cognitive ability for both humans and AI. While current research has predominantly focused on the benchmarking of text-to-image (T2I) models, we propose a more comprehensive evaluation that includes \textit{both} T2I and Large Language Models (LLMs). As spatial relations are naturally understood in a visuo-spatial manner, we develop an approach to convert LLM outputs into an image, thereby allowing us to evaluate both T2I models and LLMs \textit{visually}. We examined the spatial relation understanding of 8 prominent generative models (3 T2I models and 5 LLMs) on a set of 10 common prepositions, as well as assess the feasibility of automatic evaluation methods. Surprisingly, we found that T2I models only achieve subpar performance despite their impressive general image-generation abilities. Even more surprisingly, our results show that LLMs are significantly more accurate than T2I models in generating spatial relations, despite being primarily trained on textual data. We examined reasons for model failures and highlight gaps that can be filled to enable more spatially faithful generations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07664v1-abstract-full').style.display = 'none'; document.getElementById('2411.07664v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06899">arXiv:2411.06899</a> <span> [<a href="https://arxiv.org/pdf/2411.06899">pdf</a>, <a href="https://arxiv.org/format/2411.06899">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LongSafetyBench: Long-Context LLMs Struggle with Safety Issues </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mianqiu Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoran Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shaojun Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mozhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenkun Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhe Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linyang Li</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhikai Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linlin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06899v1-abstract-short" style="display: inline;"> With the development of large language models (LLMs), the sequence length of these models continues to increase, drawing significant attention to long-context language models. However, the evaluation of these models has been primarily limited to their capabilities, with a lack of research focusing on their safety. Existing work, such as ManyShotJailbreak, has to some extent demonstrated that long-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06899v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06899v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06899v1-abstract-full" style="display: none;"> With the development of large language models (LLMs), the sequence length of these models continues to increase, drawing significant attention to long-context language models. However, the evaluation of these models has been primarily limited to their capabilities, with a lack of research focusing on their safety. Existing work, such as ManyShotJailbreak, has to some extent demonstrated that long-context language models can exhibit safety concerns. However, the methods used are limited and lack comprehensiveness. In response, we introduce \textbf{LongSafetyBench}, the first benchmark designed to objectively and comprehensively evaluate the safety of long-context models. LongSafetyBench consists of 10 task categories, with an average length of 41,889 words. After testing eight long-context language models on LongSafetyBench, we found that existing models generally exhibit insufficient safety capabilities. The proportion of safe responses from most mainstream long-context LLMs is below 50\%. Moreover, models' safety performance in long-context scenarios does not always align with that in short-context scenarios. Further investigation revealed that long-context models tend to overlook harmful content within lengthy texts. We also proposed a simple yet effective solution, allowing open-source models to achieve performance comparable to that of top-tier closed-source models. We believe that LongSafetyBench can serve as a valuable benchmark for evaluating the safety capabilities of long-context language models. We hope that our work will encourage the broader community to pay attention to the safety of long-context models and contribute to the development of solutions to improve the safety of long-context LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06899v1-abstract-full').style.display = 'none'; document.getElementById('2411.06899v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06491">arXiv:2411.06491</a> <span> [<a href="https://arxiv.org/pdf/2411.06491">pdf</a>, <a href="https://arxiv.org/format/2411.06491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> MBL-CPDP: A Multi-objective Bilevel Method for Cross-Project Defect Prediction via Automated Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaxin Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jinliang Ding</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jiancheng Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06491v1-abstract-short" style="display: inline;"> Cross-project defect prediction (CPDP) leverages machine learning (ML) techniques to proactively identify software defects, especially where project-specific data is scarce. However, developing a robust ML pipeline with optimal hyperparameters that effectively use cross-project information and yield satisfactory performance remains challenging. In this paper, we resolve this bottleneck by formulat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06491v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06491v1-abstract-full" style="display: none;"> Cross-project defect prediction (CPDP) leverages machine learning (ML) techniques to proactively identify software defects, especially where project-specific data is scarce. However, developing a robust ML pipeline with optimal hyperparameters that effectively use cross-project information and yield satisfactory performance remains challenging. In this paper, we resolve this bottleneck by formulating CPDP as a multi-objective bilevel optimization (MBLO) method, dubbed MBL-CPDP. It comprises two nested problems: the upper-level, a multi-objective combinatorial optimization problem, enhances robustness and efficiency in optimizing ML pipelines, while the lower-level problem is an expensive optimization problem that focuses on tuning their optimal hyperparameters. Due to the high-dimensional search space characterized by feature redundancy and inconsistent data distributions, the upper-level problem combines feature selection, transfer learning, and classification to leverage limited and heterogeneous historical data. Meanwhile, an ensemble learning method is proposed to capture differences in cross-project distribution and generalize across diverse datasets. Finally, a MBLO algorithm is presented to solve this problem while achieving high adaptability effectively. To evaluate the performance of MBL-CPDP, we compare it with five automated ML tools and $50$ CPDP techniques across $20$ projects. Extensive empirical results show that MBL-CPDPoutperforms the comparison methods, demonstrating its superior adaptability and comprehensive performance evaluation capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06491v1-abstract-full').style.display = 'none'; document.getElementById('2411.06491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">37 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06137">arXiv:2411.06137</a> <span> [<a href="https://arxiv.org/pdf/2411.06137">pdf</a>, <a href="https://arxiv.org/format/2411.06137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> A Sharded Blockchain-Based Secure Federated Learning Framework for LEO Satellite Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenbo Wu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kangcheng Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhishu Shen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qiushi Zheng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jiong Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06137v1-abstract-short" style="display: inline;"> Low Earth Orbit (LEO) satellite networks are increasingly essential for space-based artificial intelligence (AI) applications. However, as commercial use expands, LEO satellite networks face heightened cyberattack risks, especially through satellite-to-satellite communication links, which are more vulnerable than ground-based connections. As the number of operational satellites continues to grow,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06137v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06137v1-abstract-full" style="display: none;"> Low Earth Orbit (LEO) satellite networks are increasingly essential for space-based artificial intelligence (AI) applications. However, as commercial use expands, LEO satellite networks face heightened cyberattack risks, especially through satellite-to-satellite communication links, which are more vulnerable than ground-based connections. As the number of operational satellites continues to grow, addressing these security challenges becomes increasingly critical. Traditional approaches, which focus on sending models to ground stations for validation, often overlook the limited communication windows available to LEO satellites, leaving critical security risks unaddressed. To tackle these challenges, we propose a sharded blockchain-based federated learning framework for LEO networks, called SBFL-LEO. This framework improves the reliability of inter-satellite communications using blockchain technology and assigns specific roles to each satellite. Miner satellites leverage cosine similarity (CS) and Density-Based Spatial Clustering of Applications with Noise (DBSCAN) to identify malicious models and monitor each other to detect inaccurate aggregated models. Security analysis and experimental results demonstrate that our approach outperforms baseline methods in both model accuracy and energy efficiency, significantly enhancing system robustness against attacks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06137v1-abstract-full').style.display = 'none'; document.getElementById('2411.06137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06087">arXiv:2411.06087</a> <span> [<a href="https://arxiv.org/pdf/2411.06087">pdf</a>, <a href="https://arxiv.org/format/2411.06087">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Cross-Domain Transfer Learning using Attention Latent Features for Multi-Agent Trajectory Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Loh%2C+J+Q">Jia Quan Loh</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xuewen Luo</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+F">Fan Ding</a>, <a href="/search/cs?searchtype=author&query=Tew%2C+H+H">Hwa Hui Tew</a>, <a href="/search/cs?searchtype=author&query=Loo%2C+J+Y">Junn Yong Loo</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z+Y">Ze Yang Ding</a>, <a href="/search/cs?searchtype=author&query=Susilawati%2C+S">Susilawati Susilawati</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+P">Chee Pin Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06087v2-abstract-short" style="display: inline;"> With the advancements of sensor hardware, traffic infrastructure and deep learning architectures, trajectory prediction of vehicles has established a solid foundation in intelligent transportation systems. However, existing solutions are often tailored to specific traffic networks at particular time periods. Consequently, deep learning models trained on one network may struggle to generalize effec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06087v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06087v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06087v2-abstract-full" style="display: none;"> With the advancements of sensor hardware, traffic infrastructure and deep learning architectures, trajectory prediction of vehicles has established a solid foundation in intelligent transportation systems. However, existing solutions are often tailored to specific traffic networks at particular time periods. Consequently, deep learning models trained on one network may struggle to generalize effectively to unseen networks. To address this, we proposed a novel spatial-temporal trajectory prediction framework that performs cross-domain adaption on the attention representation of a Transformer-based model. A graph convolutional network is also integrated to construct dynamic graph feature embeddings that accurately model the complex spatial-temporal interactions between the multi-agent vehicles across multiple traffic domains. The proposed framework is validated on two case studies involving the cross-city and cross-period settings. Experimental results show that our proposed framework achieves superior trajectory prediction and domain adaptation performances over the state-of-the-art models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06087v2-abstract-full').style.display = 'none'; document.getElementById('2411.06087v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at the IEEE International Conference on Systems, Man, and Cybernetics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04468">arXiv:2411.04468</a> <span> [<a href="https://arxiv.org/pdf/2411.04468">pdf</a>, <a href="https://arxiv.org/format/2411.04468">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fourney%2C+A">Adam Fourney</a>, <a href="/search/cs?searchtype=author&query=Bansal%2C+G">Gagan Bansal</a>, <a href="/search/cs?searchtype=author&query=Mozannar%2C+H">Hussein Mozannar</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Salinas%2C+E">Eduardo Salinas</a>, <a href="/search/cs?searchtype=author&query=Erkang"> Erkang</a>, <a href="/search/cs?searchtype=author&query=Zhu"> Zhu</a>, <a href="/search/cs?searchtype=author&query=Niedtner%2C+F">Friederike Niedtner</a>, <a href="/search/cs?searchtype=author&query=Proebsting%2C+G">Grace Proebsting</a>, <a href="/search/cs?searchtype=author&query=Bassman%2C+G">Griffin Bassman</a>, <a href="/search/cs?searchtype=author&query=Gerrits%2C+J">Jack Gerrits</a>, <a href="/search/cs?searchtype=author&query=Alber%2C+J">Jacob Alber</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+P">Peter Chang</a>, <a href="/search/cs?searchtype=author&query=Loynd%2C+R">Ricky Loynd</a>, <a href="/search/cs?searchtype=author&query=West%2C+R">Robert West</a>, <a href="/search/cs?searchtype=author&query=Dibia%2C+V">Victor Dibia</a>, <a href="/search/cs?searchtype=author&query=Awadallah%2C+A">Ahmed Awadallah</a>, <a href="/search/cs?searchtype=author&query=Kamar%2C+E">Ece Kamar</a>, <a href="/search/cs?searchtype=author&query=Hosn%2C+R">Rafah Hosn</a>, <a href="/search/cs?searchtype=author&query=Amershi%2C+S">Saleema Amershi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04468v1-abstract-short" style="display: inline;"> Modern AI agents, driven by advances in large foundation models, promise to enhance our productivity and transform our lives by augmenting our knowledge and capabilities. To achieve this vision, AI agents must effectively plan, perform multi-step reasoning and actions, respond to novel observations, and recover from errors, to successfully complete complex tasks across a wide range of scenarios. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04468v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04468v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04468v1-abstract-full" style="display: none;"> Modern AI agents, driven by advances in large foundation models, promise to enhance our productivity and transform our lives by augmenting our knowledge and capabilities. To achieve this vision, AI agents must effectively plan, perform multi-step reasoning and actions, respond to novel observations, and recover from errors, to successfully complete complex tasks across a wide range of scenarios. In this work, we introduce Magentic-One, a high-performing open-source agentic system for solving such tasks. Magentic-One uses a multi-agent architecture where a lead agent, the Orchestrator, plans, tracks progress, and re-plans to recover from errors. Throughout task execution, the Orchestrator directs other specialized agents to perform tasks as needed, such as operating a web browser, navigating local files, or writing and executing Python code. We show that Magentic-One achieves statistically competitive performance to the state-of-the-art on three diverse and challenging agentic benchmarks: GAIA, AssistantBench, and WebArena. Magentic-One achieves these results without modification to core agent capabilities or to how they collaborate, demonstrating progress towards generalist agentic systems. Moreover, Magentic-One's modular design allows agents to be added or removed from the team without additional prompt tuning or training, easing development and making it extensible to future scenarios. We provide an open-source implementation of Magentic-One, and we include AutoGenBench, a standalone tool for agentic evaluation. AutoGenBench provides built-in controls for repetition and isolation to run agentic benchmarks in a rigorous and contained manner -- which is important when agents' actions have side-effects. Magentic-One, AutoGenBench and detailed empirical performance evaluations of Magentic-One, including ablations and error analysis are available at https://aka.ms/magentic-one <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04468v1-abstract-full').style.display = 'none'; document.getElementById('2411.04468v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03357">arXiv:2411.03357</a> <span> [<a href="https://arxiv.org/pdf/2411.03357">pdf</a>, <a href="https://arxiv.org/format/2411.03357">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> PipeLLM: Fast and Confidential Large Language Model Services with Speculative Pipelined Encryption </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yifan Tan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+Z">Zeyu Mi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haibo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03357v1-abstract-short" style="display: inline;"> Confidential computing on GPUs, like NVIDIA H100, mitigates the security risks of outsourced Large Language Models (LLMs) by implementing strong isolation and data encryption. Nonetheless, this encryption incurs a significant performance overhead, reaching up to 52.8 percent and 88.2 percent throughput drop when serving OPT-30B and OPT-66B, respectively. To address this challenge, we introduce Pip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03357v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03357v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03357v1-abstract-full" style="display: none;"> Confidential computing on GPUs, like NVIDIA H100, mitigates the security risks of outsourced Large Language Models (LLMs) by implementing strong isolation and data encryption. Nonetheless, this encryption incurs a significant performance overhead, reaching up to 52.8 percent and 88.2 percent throughput drop when serving OPT-30B and OPT-66B, respectively. To address this challenge, we introduce PipeLLM, a user-transparent runtime system. PipeLLM removes the overhead by overlapping the encryption and GPU computation through pipelining - an idea inspired by the CPU instruction pipelining - thereby effectively concealing the latency increase caused by encryption. The primary technical challenge is that, unlike CPUs, the encryption module lacks prior knowledge of the specific data needing encryption until it is requested by the GPUs. To this end, we propose speculative pipelined encryption to predict the data requiring encryption by analyzing the serving patterns of LLMs. Further, we have developed an efficient, low-cost pipeline relinquishing approach for instances of incorrect predictions. Our experiments on NVIDIA H100 GPU show that compared with vanilla systems without confidential computing (e.g., vLLM, PEFT, and FlexGen), PipeLLM incurs modest overhead (less than 19.6 percent in throughput) across various LLM sizes, from 13B to 175B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03357v1-abstract-full').style.display = 'none'; document.getElementById('2411.03357v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in ASPLOS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02115">arXiv:2411.02115</a> <span> [<a href="https://arxiv.org/pdf/2411.02115">pdf</a>, <a href="https://arxiv.org/format/2411.02115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedMoE-DA: Federated Mixture of Experts via Domain Aware Fine-grained Aggregation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Ziwei Zhan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wenkuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanqing Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weijie Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoxi Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chuan Wu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Deke Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02115v1-abstract-short" style="display: inline;"> Federated learning (FL) is a collaborative machine learning approach that enables multiple clients to train models without sharing their private data. With the rise of deep learning, large-scale models have garnered significant attention due to their exceptional performance. However, a key challenge in FL is the limitation imposed by clients with constrained computational and communication resourc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02115v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02115v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02115v1-abstract-full" style="display: none;"> Federated learning (FL) is a collaborative machine learning approach that enables multiple clients to train models without sharing their private data. With the rise of deep learning, large-scale models have garnered significant attention due to their exceptional performance. However, a key challenge in FL is the limitation imposed by clients with constrained computational and communication resources, which hampers the deployment of these large models. The Mixture of Experts (MoE) architecture addresses this challenge with its sparse activation property, which reduces computational workload and communication demands during inference and updates. Additionally, MoE facilitates better personalization by allowing each expert to specialize in different subsets of the data distribution. To alleviate the communication burdens between the server and clients, we propose FedMoE-DA, a new FL model training framework that leverages the MoE architecture and incorporates a novel domain-aware, fine-grained aggregation strategy to enhance the robustness, personalizability, and communication efficiency simultaneously. Specifically, the correlation between both intra-client expert models and inter-client data heterogeneity is exploited. Moreover, we utilize peer-to-peer (P2P) communication between clients for selective expert model synchronization, thus significantly reducing the server-client transmissions. Experiments demonstrate that our FedMoE-DA achieves excellent performance while reducing the communication pressure on the server. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02115v1-abstract-full').style.display = 'none'; document.getElementById('2411.02115v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01856">arXiv:2411.01856</a> <span> [<a href="https://arxiv.org/pdf/2411.01856">pdf</a>, <a href="https://arxiv.org/format/2411.01856">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> MeToken: Uniform Micro-environment Token Boosts Post-Translational Modification Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhenxiao Cao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangyang Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lirong Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyuan Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yufei Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Bozhen Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S+Z">Stan Z. Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01856v1-abstract-short" style="display: inline;"> Post-translational modifications (PTMs) profoundly expand the complexity and functionality of the proteome, regulating protein attributes and interactions that are crucial for biological processes. Accurately predicting PTM sites and their specific types is therefore essential for elucidating protein function and understanding disease mechanisms. Existing computational approaches predominantly foc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01856v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01856v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01856v1-abstract-full" style="display: none;"> Post-translational modifications (PTMs) profoundly expand the complexity and functionality of the proteome, regulating protein attributes and interactions that are crucial for biological processes. Accurately predicting PTM sites and their specific types is therefore essential for elucidating protein function and understanding disease mechanisms. Existing computational approaches predominantly focus on protein sequences to predict PTM sites, driven by the recognition of sequence-dependent motifs. However, these approaches often overlook protein structural contexts. In this work, we first compile a large-scale sequence-structure PTM dataset, which serves as the foundation for fair comparison. We introduce the MeToken model, which tokenizes the micro-environment of each amino acid, integrating both sequence and structural information into unified discrete tokens. This model not only captures the typical sequence motifs associated with PTMs but also leverages the spatial arrangements dictated by protein tertiary structures, thus providing a holistic view of the factors influencing PTM sites. Designed to address the long-tail distribution of PTM types, MeToken employs uniform sub-codebooks that ensure even the rarest PTMs are adequately represented and distinguished. We validate the effectiveness and generalizability of MeToken across multiple datasets, demonstrating its superior performance in accurately identifying PTM types. The results underscore the importance of incorporating structural data and highlight MeToken's potential in facilitating accurate and comprehensive PTM predictions, which could significantly impact proteomics research. The code and datasets are available at https://github.com/A4Bio/MeToken. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01856v1-abstract-full').style.display = 'none'; document.getElementById('2411.01856v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 20 figures, 10 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01825">arXiv:2411.01825</a> <span> [<a href="https://arxiv.org/pdf/2411.01825">pdf</a>, <a href="https://arxiv.org/format/2411.01825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3233/FAIA240727">10.3233/FAIA240727 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FedReMa: Improving Personalized Federated Learning via Leveraging the Most Relevant Clients </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+H">Han Liang</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Ziwei Zhan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weijie Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoxi Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01825v2-abstract-short" style="display: inline;"> Federated Learning (FL) is a distributed machine learning paradigm that achieves a globally robust model through decentralized computation and periodic model synthesis, primarily focusing on the global model's accuracy over aggregated datasets of all participating clients. Personalized Federated Learning (PFL) instead tailors exclusive models for each client, aiming to enhance the accuracy of clie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01825v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01825v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01825v2-abstract-full" style="display: none;"> Federated Learning (FL) is a distributed machine learning paradigm that achieves a globally robust model through decentralized computation and periodic model synthesis, primarily focusing on the global model's accuracy over aggregated datasets of all participating clients. Personalized Federated Learning (PFL) instead tailors exclusive models for each client, aiming to enhance the accuracy of clients' individual models on specific local data distributions. Despite of their wide adoption, existing FL and PFL works have yet to comprehensively address the class-imbalance issue, one of the most critical challenges within the realm of data heterogeneity in PFL and FL research. In this paper, we propose FedReMa, an efficient PFL algorithm that can tackle class-imbalance by 1) utilizing an adaptive inter-client co-learning approach to identify and harness different clients' expertise on different data classes throughout various phases of the training process, and 2) employing distinct aggregation methods for clients' feature extractors and classifiers, with the choices informed by the different roles and implications of these model components. Specifically, driven by our experimental findings on inter-client similarity dynamics, we develop critical co-learning period (CCP), wherein we introduce a module named maximum difference segmentation (MDS) to assess and manage task relevance by analyzing the similarities between clients' logits of their classifiers. Outside the CCP, we employ an additional scheme for model aggregation that utilizes historical records of each client's most relevant peers to further enhance the personalization stability. We demonstrate the superiority of our FedReMa in extensive experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01825v2-abstract-full').style.display = 'none'; document.getElementById('2411.01825v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures, accepted by European Conference on Artificial Intelligence (2024 ECAI)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In ECAI 2024 (pp. 2090-2097). IOS Press (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01817">arXiv:2411.01817</a> <span> [<a href="https://arxiv.org/pdf/2411.01817">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> High-Pass Graph Convolutional Network for Enhanced Anomaly Detection: A Novel Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shelei Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y+C">Yong Chai Tan</a>, <a href="/search/cs?searchtype=author&query=Vincent%2C+T">Tai Vincent</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01817v1-abstract-short" style="display: inline;"> Graph Convolutional Network (GCN) are widely used in Graph Anomaly Detection (GAD) due to their natural compatibility with graph structures, resulting in significant performance improvements. However, most researchers approach GAD as a graph node classification task and often rely on low-pass filters or feature aggregation from neighboring nodes. This paper proposes a novel approach by introducing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01817v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01817v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01817v1-abstract-full" style="display: none;"> Graph Convolutional Network (GCN) are widely used in Graph Anomaly Detection (GAD) due to their natural compatibility with graph structures, resulting in significant performance improvements. However, most researchers approach GAD as a graph node classification task and often rely on low-pass filters or feature aggregation from neighboring nodes. This paper proposes a novel approach by introducing a High-Pass Graph Convolution Network (HP-GCN) for GAD. The proposed HP-GCN leverages high-frequency components to detect anomalies, as anomalies tend to increase high-frequency signals within the network of normal nodes. Additionally, isolated nodes, which lack interactions with other nodes, present a challenge for Graph Neural Network (GNN). To address this, the model segments the graph into isolated nodes and nodes within connected subgraphs. Isolated nodes learn their features through Multi-Layer Perceptron (MLP), enhancing detection accuracy. The model is evaluated and validated on YelpChi, Amazon, T-Finance, and T-Social datasets. The results showed that the proposed HP-GCN can achieve anomaly detection accuracy of 96.10%, 98.16%, 96.46%, and 98.94%, respectively. The findings demonstrate that the HP-GCN outperforms existing GAD methods based on spatial domain GNN as well as those using low-pass and band-pass filters in spectral domain GCN. The findings underscore the effectiveness of this method in improving anomaly detection performance. Source code can be found at: https://github.com/meteor0033/High-pass_GAD.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01817v1-abstract-full').style.display = 'none'; document.getElementById('2411.01817v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01475">arXiv:2411.01475</a> <span> [<a href="https://arxiv.org/pdf/2411.01475">pdf</a>, <a href="https://arxiv.org/format/2411.01475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Interaction-Aware Trajectory Prediction for Safe Motion Planning in Autonomous Driving: A Transformer-Transfer Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jinhao Liang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaopeng Tan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+L">Longhao Yan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingyuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+G">Guodong Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kaidi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01475v1-abstract-short" style="display: inline;"> A critical aspect of safe and efficient motion planning for autonomous vehicles (AVs) is to handle the complex and uncertain behavior of surrounding human-driven vehicles (HDVs). Despite intensive research on driver behavior prediction, existing approaches typically overlook the interactions between AVs and HDVs assuming that HDV trajectories are not affected by AV actions. To address this gap, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01475v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01475v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01475v1-abstract-full" style="display: none;"> A critical aspect of safe and efficient motion planning for autonomous vehicles (AVs) is to handle the complex and uncertain behavior of surrounding human-driven vehicles (HDVs). Despite intensive research on driver behavior prediction, existing approaches typically overlook the interactions between AVs and HDVs assuming that HDV trajectories are not affected by AV actions. To address this gap, we present a transformer-transfer learning-based interaction-aware trajectory predictor for safe motion planning of autonomous driving, focusing on a vehicle-to-vehicle (V2V) interaction scenario consisting of an AV and an HDV. Specifically, we construct a transformer-based interaction-aware trajectory predictor using widely available datasets of HDV trajectory data and further transfer the learned predictor using a small set of AV-HDV interaction data. Then, to better incorporate the proposed trajectory predictor into the motion planning module of AVs, we introduce an uncertainty quantification method to characterize the errors of the predictor, which are integrated into the path-planning process. Our experimental results demonstrate the value of explicitly considering interactions and handling uncertainties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01475v1-abstract-full').style.display = 'none'; document.getElementById('2411.01475v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00666">arXiv:2411.00666</a> <span> [<a href="https://arxiv.org/pdf/2411.00666">pdf</a>, <a href="https://arxiv.org/format/2411.00666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond the Boundaries of Proximal Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C+B">Charlie B. Tan</a>, <a href="/search/cs?searchtype=author&query=Toledo%2C+E">Edan Toledo</a>, <a href="/search/cs?searchtype=author&query=Ellis%2C+B">Benjamin Ellis</a>, <a href="/search/cs?searchtype=author&query=Foerster%2C+J+N">Jakob N. Foerster</a>, <a href="/search/cs?searchtype=author&query=Husz%C3%A1r%2C+F">Ferenc Husz谩r</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00666v1-abstract-short" style="display: inline;"> Proximal policy optimization (PPO) is a widely-used algorithm for on-policy reinforcement learning. This work offers an alternative perspective of PPO, in which it is decomposed into the inner-loop estimation of update vectors, and the outer-loop application of updates using gradient ascent with unity learning rate. Using this insight we propose outer proximal policy optimization (outer-PPO); a fr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00666v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00666v1-abstract-full" style="display: none;"> Proximal policy optimization (PPO) is a widely-used algorithm for on-policy reinforcement learning. This work offers an alternative perspective of PPO, in which it is decomposed into the inner-loop estimation of update vectors, and the outer-loop application of updates using gradient ascent with unity learning rate. Using this insight we propose outer proximal policy optimization (outer-PPO); a framework wherein these update vectors are applied using an arbitrary gradient-based optimizer. The decoupling of update estimation and update application enabled by outer-PPO highlights several implicit design choices in PPO that we challenge through empirical investigation. In particular we consider non-unity learning rates and momentum applied to the outer loop, and a momentum-bias applied to the inner estimation loop. Methods are evaluated against an aggressively tuned PPO baseline on Brax, Jumanji and MinAtar environments; non-unity learning rates and momentum both achieve statistically significant improvement on Brax and Jumanji, given the same hyperparameter tuning budget. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00666v1-abstract-full').style.display = 'none'; document.getElementById('2411.00666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00625">arXiv:2411.00625</a> <span> [<a href="https://arxiv.org/pdf/2411.00625">pdf</a>, <a href="https://arxiv.org/format/2411.00625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toward Automated Algorithm Design: A Survey and Practical Guide to Meta-Black-Box-Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zeyuan Ma</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongshu Guo</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yue-Jiao Gong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00625v2-abstract-short" style="display: inline;"> In this survey, we introduce Meta-Black-Box-Optimization~(MetaBBO) as an emerging avenue within the Evolutionary Computation~(EC) community, which incorporates Meta-learning approaches to assist automated algorithm design. Despite the success of MetaBBO, the current literature provides insufficient summaries of its key aspects and lacks practical guidance for implementation. To bridge this gap, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00625v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00625v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00625v2-abstract-full" style="display: none;"> In this survey, we introduce Meta-Black-Box-Optimization~(MetaBBO) as an emerging avenue within the Evolutionary Computation~(EC) community, which incorporates Meta-learning approaches to assist automated algorithm design. Despite the success of MetaBBO, the current literature provides insufficient summaries of its key aspects and lacks practical guidance for implementation. To bridge this gap, we offer a comprehensive review of recent advances in MetaBBO, providing an in-depth examination of its key developments. We begin with a unified definition of the MetaBBO paradigm, followed by a systematic taxonomy of various algorithm design tasks, including algorithm selection, algorithm configuration, solution manipulation, and algorithm generation. Further, we conceptually summarize different learning methodologies behind current MetaBBO works, including reinforcement learning, supervised learning, neuroevolution, and in-context learning with Large Language Models. A comprehensive evaluation of the latest representative MetaBBO methods is then carried out, alongside an experimental analysis of their optimization performance, computational efficiency, and generalization ability. Based on the evaluation results, we meticulously identify a set of core designs that enhance the generalization and learning effectiveness of MetaBBO. Finally, we outline the vision for the field by providing insight into the latest trends and potential future directions. Relevant literature will be continuously collected and updated at \url{https://github.com/GMC-DRL/Awesome-MetaBBO}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00625v2-abstract-full').style.display = 'none'; document.getElementById('2411.00625v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18687">arXiv:2410.18687</a> <span> [<a href="https://arxiv.org/pdf/2410.18687">pdf</a>, <a href="https://arxiv.org/format/2410.18687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ODDN: Addressing Unpaired Data Challenges in Open-World Deepfake Detection on Online Social Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+R">Renshuai Tao</a>, <a href="/search/cs?searchtype=author&query=Le%2C+M">Manyi Le</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chuangchuang Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+H">Haotong Qin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yao Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18687v1-abstract-short" style="display: inline;"> Despite significant advances in deepfake detection, handling varying image quality, especially due to different compressions on online social networks (OSNs), remains challenging. Current methods succeed by leveraging correlations between paired images, whether raw or compressed. However, in open-world scenarios, paired data is scarce, with compressed images readily available but corresponding raw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18687v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18687v1-abstract-full" style="display: none;"> Despite significant advances in deepfake detection, handling varying image quality, especially due to different compressions on online social networks (OSNs), remains challenging. Current methods succeed by leveraging correlations between paired images, whether raw or compressed. However, in open-world scenarios, paired data is scarce, with compressed images readily available but corresponding raw versions difficult to obtain. This imbalance, where unpaired data vastly outnumbers paired data, often leads to reduced detection performance, as existing methods struggle without corresponding raw images. To overcome this issue, we propose a novel approach named the open-world deepfake detection network (ODDN), which comprises two core modules: open-world data aggregation (ODA) and compression-discard gradient correction (CGC). ODA effectively aggregates correlations between compressed and raw samples through both fine-grained and coarse-grained analyses for paired and unpaired data, respectively. CGC incorporates a compression-discard gradient correction to further enhance performance across diverse compression methods in OSN. This technique optimizes the training gradient to ensure the model remains insensitive to compression variations. Extensive experiments conducted on 17 popular deepfake datasets demonstrate the superiority of the ODDN over SOTA baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18687v1-abstract-full').style.display = 'none'; document.getElementById('2410.18687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17799">arXiv:2410.17799</a> <span> [<a href="https://arxiv.org/pdf/2410.17799">pdf</a>, <a href="https://arxiv.org/format/2410.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Luyao Cheng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Siqi Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaqing Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+H">Hai Yu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaohong Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17799v1-abstract-short" style="display: inline;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17799v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17799v1-abstract-full" style="display: none;"> Full-duplex spoken dialogue systems significantly advance over traditional turn-based dialogue systems, as they allow simultaneous bidirectional communication, closely mirroring human-human interactions. However, achieving low latency and natural interactions in full-duplex dialogue systems remains a significant challenge, especially considering human conversation dynamics such as interruptions, backchannels, and overlapping speech. In this paper, we introduce a novel End-to-End GPT-based model OmniFlatten for full-duplex conversation, capable of effectively modeling the complex behaviors inherent to natural conversations with low latency. To achieve full-duplex communication capabilities, we propose a multi-stage post-training scheme that progressively adapts a text-based large language model (LLM) backbone into a speech-text dialogue LLM, capable of generating text and speech in real time, without modifying the architecture of the backbone LLM. The training process comprises three stages: modality alignment, half-duplex dialogue learning, and full-duplex dialogue learning. Throughout all training stages, we standardize the data using a flattening operation, which allows us to unify the training methods and the model architecture across different modalities and tasks. Our approach offers a straightforward modeling technique and a promising research direction for developing efficient and natural end-to-end full-duplex spoken dialogue systems. Audio samples of dialogues generated by OmniFlatten can be found at this web site (https://omniflatten.github.io/). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17799v1-abstract-full').style.display = 'none'; document.getElementById('2410.17799v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17309">arXiv:2410.17309</a> <span> [<a href="https://arxiv.org/pdf/2410.17309">pdf</a>, <a href="https://arxiv.org/format/2410.17309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Literature Meets Data: A Synergistic Approach to Hypothesis Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haokun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yangqiaoyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingxuan Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chenfei Yuan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenhao Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17309v2-abstract-short" style="display: inline;"> AI holds promise for transforming scientific processes, including hypothesis generation. Prior work on hypothesis generation can be broadly categorized into theory-driven and data-driven approaches. While both have proven effective in generating novel and plausible hypotheses, it remains an open question whether they can complement each other. To address this, we develop the first method that comb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17309v2-abstract-full').style.display = 'inline'; document.getElementById('2410.17309v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17309v2-abstract-full" style="display: none;"> AI holds promise for transforming scientific processes, including hypothesis generation. Prior work on hypothesis generation can be broadly categorized into theory-driven and data-driven approaches. While both have proven effective in generating novel and plausible hypotheses, it remains an open question whether they can complement each other. To address this, we develop the first method that combines literature-based insights with data to perform LLM-powered hypothesis generation. We apply our method on five different datasets and demonstrate that integrating literature and data outperforms other baselines (8.97\% over few-shot, 15.75\% over literature-based alone, and 3.37\% over data-driven alone). Additionally, we conduct the first human evaluation to assess the utility of LLM-generated hypotheses in assisting human decision-making on two challenging tasks: deception detection and AI generated content detection. Our results show that human accuracy improves significantly by 7.44\% and 14.19\% on these tasks, respectively. These findings suggest that integrating literature-based and data-driven approaches provides a comprehensive and nuanced framework for hypothesis generation and could open new avenues for scientific inquiry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17309v2-abstract-full').style.display = 'none'; document.getElementById('2410.17309v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 7 figures, code link: https://github.com/ChicagoHAI/hypothesis-generation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15908">arXiv:2410.15908</a> <span> [<a href="https://arxiv.org/pdf/2410.15908">pdf</a>, <a href="https://arxiv.org/ps/2410.15908">ps</a>, <a href="https://arxiv.org/format/2410.15908">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Formalising CXL Cache Coherence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chengsong Tan</a>, <a href="/search/cs?searchtype=author&query=Donaldson%2C+A+F">Alastair F. Donaldson</a>, <a href="/search/cs?searchtype=author&query=Wickerson%2C+J">John Wickerson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15908v1-abstract-short" style="display: inline;"> We report our experience formally modelling and verifying CXL.cache, the inter-device cache coherence protocol of the Compute Express Link standard. We have used the Isabelle proof assistant to create a formal model for CXL.cache based on the prose English specification. This led to us identifying and proposing fixes to several problems we identified as unclear, ambiguous or inaccurate, some of wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15908v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15908v1-abstract-full" style="display: none;"> We report our experience formally modelling and verifying CXL.cache, the inter-device cache coherence protocol of the Compute Express Link standard. We have used the Isabelle proof assistant to create a formal model for CXL.cache based on the prose English specification. This led to us identifying and proposing fixes to several problems we identified as unclear, ambiguous or inaccurate, some of which could lead to incoherence if left unfixed. Nearly all our issues and proposed fixes have been confirmed and tentatively accepted by the CXL consortium for adoption, save for one which is still under discussion. To validate the faithfulness of our model we performed scenario verification of essential restrictions such as "Snoop-pushes-GO", and produced a fully mechanised proof of a coherence property of the model. The considerable size of this proof, comprising tens of thousands of lemmas, prompted us to develop new proof automation tools, which we have made available for other Isabelle users working with similarly cumbersome proofs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15908v1-abstract-full').style.display = 'none'; document.getElementById('2410.15908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15620">arXiv:2410.15620</a> <span> [<a href="https://arxiv.org/pdf/2410.15620">pdf</a>, <a href="https://arxiv.org/format/2410.15620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Acoustic Model Optimization over Multiple Data Sources: Merging and Valuation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+V+J">Victor Junqiu Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Di Jiang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Conghui Tan</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+R">Rongzhong Lian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15620v1-abstract-short" style="display: inline;"> Due to the rising awareness of privacy protection and the voluminous scale of speech data, it is becoming infeasible for Automatic Speech Recognition (ASR) system developers to train the acoustic model with complete data as before. For example, the data may be owned by different curators, and it is not allowed to share with others. In this paper, we propose a novel paradigm to solve salient proble… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15620v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15620v1-abstract-full" style="display: none;"> Due to the rising awareness of privacy protection and the voluminous scale of speech data, it is becoming infeasible for Automatic Speech Recognition (ASR) system developers to train the acoustic model with complete data as before. For example, the data may be owned by different curators, and it is not allowed to share with others. In this paper, we propose a novel paradigm to solve salient problems plaguing the ASR field. In the first stage, multiple acoustic models are trained based upon different subsets of the complete speech data, while in the second phase, two novel algorithms are utilized to generate a high-quality acoustic model based upon those trained on data subsets. We first propose the Genetic Merge Algorithm (GMA), which is a highly specialized algorithm for optimizing acoustic models but suffers from low efficiency. We further propose the SGD-Based Optimizational Merge Algorithm (SOMA), which effectively alleviates the efficiency bottleneck of GMA and maintains superior model accuracy. Extensive experiments on public data show that the proposed methods can significantly outperform the state-of-the-art. Furthermore, we introduce Shapley Value to estimate the contribution score of the trained models, which is useful for evaluating the effectiveness of the data and providing fair incentives to their curators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15620v1-abstract-full').style.display = 'none'; document.getElementById('2410.15620v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15285">arXiv:2410.15285</a> <span> [<a href="https://arxiv.org/pdf/2410.15285">pdf</a>, <a href="https://arxiv.org/format/2410.15285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Contextual Augmented Multi-Model Programming (CAMP): A Hybrid Local-Cloud Copilot Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuchen Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shangxin Guo</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15285v1-abstract-short" style="display: inline;"> The advancements in cloud-based Large Languages Models (LLMs) have revolutionized AI-assisted programming. However, their integration into certain local development environments like ones within the Apple software ecosystem (e.g., iOS apps, macOS) remains challenging due to computational demands and sandboxed constraints. This paper presents CAMP, a multi-model AI-assisted programming framework th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15285v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15285v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15285v1-abstract-full" style="display: none;"> The advancements in cloud-based Large Languages Models (LLMs) have revolutionized AI-assisted programming. However, their integration into certain local development environments like ones within the Apple software ecosystem (e.g., iOS apps, macOS) remains challenging due to computational demands and sandboxed constraints. This paper presents CAMP, a multi-model AI-assisted programming framework that consists of a local model that employs Retrieval-Augmented Generation (RAG) to retrieve contextual information from the codebase to facilitate context-aware prompt construction thus optimizing the performance of the cloud model, empowering LLMs' capabilities in local Integrated Development Environments (IDEs). The methodology is actualized in Copilot for Xcode, an AI-assisted programming tool crafted for Xcode that employs the RAG module to address software constraints and enables diverse generative programming tasks, including automatic code completion, documentation, error detection, and intelligent user-agent interaction. The results from objective experiments on generated code quality and subjective experiments on user adoption collectively demonstrate the pilot success of the proposed system and mark its significant contributions to the realm of AI-assisted programming. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15285v1-abstract-full').style.display = 'none'; document.getElementById('2410.15285v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 3 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15010">arXiv:2410.15010</a> <span> [<a href="https://arxiv.org/pdf/2410.15010">pdf</a>, <a href="https://arxiv.org/format/2410.15010">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FlexMol: A Flexible Toolkit for Benchmarking Molecular Relational Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sizhe Liu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lecheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuchen Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+W">Wenjie Du</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangyang Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Bozhen Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+H">Hongxin Xiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S+Z">Stan Z. Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15010v1-abstract-short" style="display: inline;"> Molecular relational learning (MRL) is crucial for understanding the interaction behaviors between molecular pairs, a critical aspect of drug discovery and development. However, the large feasible model space of MRL poses significant challenges to benchmarking, and existing MRL frameworks face limitations in flexibility and scope. To address these challenges, avoid repetitive coding efforts, and e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15010v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15010v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15010v1-abstract-full" style="display: none;"> Molecular relational learning (MRL) is crucial for understanding the interaction behaviors between molecular pairs, a critical aspect of drug discovery and development. However, the large feasible model space of MRL poses significant challenges to benchmarking, and existing MRL frameworks face limitations in flexibility and scope. To address these challenges, avoid repetitive coding efforts, and ensure fair comparison of models, we introduce FlexMol, a comprehensive toolkit designed to facilitate the construction and evaluation of diverse model architectures across various datasets and performance metrics. FlexMol offers a robust suite of preset model components, including 16 drug encoders, 13 protein sequence encoders, 9 protein structure encoders, and 7 interaction layers. With its easy-to-use API and flexibility, FlexMol supports the dynamic construction of over 70, 000 distinct combinations of model architectures. Additionally, we provide detailed benchmark results and code examples to demonstrate FlexMol's effectiveness in simplifying and standardizing MRL model development and comparison. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15010v1-abstract-full').style.display = 'none'; document.getElementById('2410.15010v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14184">arXiv:2410.14184</a> <span> [<a href="https://arxiv.org/pdf/2410.14184">pdf</a>, <a href="https://arxiv.org/format/2410.14184">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MetaAlign: Align Large Language Models with Diverse Preferences during Inference Time </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mozhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenkun Tan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mianqiu Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14184v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) acquire extensive knowledge and remarkable abilities from extensive text corpora, making them powerful tools for various applications. To make LLMs more usable, aligning them with human preferences is essential. Existing alignment techniques, such as Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO), typically embed predefined p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14184v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14184v1-abstract-full" style="display: none;"> Large Language Models (LLMs) acquire extensive knowledge and remarkable abilities from extensive text corpora, making them powerful tools for various applications. To make LLMs more usable, aligning them with human preferences is essential. Existing alignment techniques, such as Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO), typically embed predefined preferences directly within the model's parameters. These methods, however, often result in a static alignment that can not account for the diversity of human preferences in practical applications. In response to this challenge, we propose an effective method, \textbf{MetaAlign}, which aims to help LLMs dynamically align with various explicit or implicit preferences specified at inference time. Experimental results show that LLMs optimized on our meticulously constructed MetaAlign Dataset can effectively align with any preferences specified at the inference stage, validating the feasibility of MetaAlign. We hope that our work can provide some insights into the alignment of language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14184v1-abstract-full').style.display = 'none'; document.getElementById('2410.14184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13221">arXiv:2410.13221</a> <span> [<a href="https://arxiv.org/pdf/2410.13221">pdf</a>, <a href="https://arxiv.org/format/2410.13221">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Investigating Effective Speaker Property Privacy Protection in Federated Learning for Speech Emotion Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chao Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sheng Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Z">Zhao Ren</a>, <a href="/search/cs?searchtype=author&query=Schultz%2C+T">Tanja Schultz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13221v1-abstract-short" style="display: inline;"> Federated Learning (FL) is a privacy-preserving approach that allows servers to aggregate distributed models transmitted from local clients rather than training on user data. More recently, FL has been applied to Speech Emotion Recognition (SER) for secure human-computer interaction applications. Recent research has found that FL is still vulnerable to inference attacks. To this end, this paper fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13221v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13221v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13221v1-abstract-full" style="display: none;"> Federated Learning (FL) is a privacy-preserving approach that allows servers to aggregate distributed models transmitted from local clients rather than training on user data. More recently, FL has been applied to Speech Emotion Recognition (SER) for secure human-computer interaction applications. Recent research has found that FL is still vulnerable to inference attacks. To this end, this paper focuses on investigating the security of FL for SER concerning property inference attacks. We propose a novel method to protect the property information in speech data by decomposing various properties in the sound and adding perturbations to these properties. Our experiments show that the proposed method offers better privacy-utility trade-offs than existing methods. The trade-offs enable more effective attack prevention while maintaining similar FL utility levels. This work can guide future work on privacy protection methods in speech processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13221v1-abstract-full').style.display = 'none'; document.getElementById('2410.13221v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09875">arXiv:2410.09875</a> <span> [<a href="https://arxiv.org/pdf/2410.09875">pdf</a>, <a href="https://arxiv.org/format/2410.09875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> ViFi-ReID: A Two-Stream Vision-WiFi Multimodal Approach for Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+C">Chen Mao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chong Tan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingqi Hu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+M">Min Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09875v1-abstract-short" style="display: inline;"> Person re-identification(ReID), as a crucial technology in the field of security, plays a vital role in safety inspections, personnel counting, and more. Most current ReID approaches primarily extract features from images, which are easily affected by objective conditions such as clothing changes and occlusions. In addition to cameras, we leverage widely available routers as sensing devices by cap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09875v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09875v1-abstract-full" style="display: none;"> Person re-identification(ReID), as a crucial technology in the field of security, plays a vital role in safety inspections, personnel counting, and more. Most current ReID approaches primarily extract features from images, which are easily affected by objective conditions such as clothing changes and occlusions. In addition to cameras, we leverage widely available routers as sensing devices by capturing gait information from pedestrians through the Channel State Information (CSI) in WiFi signals and contribute a multimodal dataset. We employ a two-stream network to separately process video understanding and signal analysis tasks, and conduct multi-modal fusion and contrastive learning on pedestrian video and WiFi data. Extensive experiments in real-world scenarios demonstrate that our method effectively uncovers the correlations between heterogeneous data, bridges the gap between visual and signal modalities, significantly expands the sensing range, and improves ReID accuracy across multiple sensors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09875v1-abstract-full').style.display = 'none'; document.getElementById('2410.09875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08207">arXiv:2410.08207</a> <span> [<a href="https://arxiv.org/pdf/2410.08207">pdf</a>, <a href="https://arxiv.org/format/2410.08207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DICE: Discrete Inversion Enabling Controllable Editing for Multinomial Diffusion and Masked Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxiao He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Dao%2C+Q">Quan Dao</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+S">Song Wen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+M">Minhao Bai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Di Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&query=Min%2C+M+R">Martin Renqiang Min</a>, <a href="/search/cs?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaowei Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bo Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongdong Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junzhou Huang</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+F">Faez Ahmed</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+A">Akash Srivastava</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D">Dimitris Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08207v1-abstract-short" style="display: inline;"> Discrete diffusion models have achieved success in tasks like image generation and masked language modeling but face limitations in controlled content editing. We introduce DICE (Discrete Inversion for Controllable Editing), the first approach to enable precise inversion for discrete diffusion models, including multinomial diffusion and masked generative models. By recording noise sequences and ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08207v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08207v1-abstract-full" style="display: none;"> Discrete diffusion models have achieved success in tasks like image generation and masked language modeling but face limitations in controlled content editing. We introduce DICE (Discrete Inversion for Controllable Editing), the first approach to enable precise inversion for discrete diffusion models, including multinomial diffusion and masked generative models. By recording noise sequences and masking patterns during the reverse diffusion process, DICE enables accurate reconstruction and flexible editing of discrete data without the need for predefined masks or attention manipulation. We demonstrate the effectiveness of DICE across both image and text domains, evaluating it on models such as VQ-Diffusion, Paella, and RoBERTa. Our results show that DICE preserves high data fidelity while enhancing editing capabilities, offering new opportunities for fine-grained content manipulation in discrete spaces. For project webpage, see https://hexiaoxiao-cs.github.io/DICE/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08207v1-abstract-full').style.display = 'none'; document.getElementById('2410.08207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08035">arXiv:2410.08035</a> <span> [<a href="https://arxiv.org/pdf/2410.08035">pdf</a>, <a href="https://arxiv.org/format/2410.08035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IntrinsicVoice: Empowering LLMs with Intrinsic Real-time Voice Interaction Abilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xiang Lyu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaohong Tan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tianyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08035v2-abstract-short" style="display: inline;"> Current methods of building LLMs with voice interaction capabilities rely heavily on explicit text autoregressive generation before or during speech response generation to maintain content quality, which unfortunately brings computational overhead and increases latency in multi-turn interactions. To address this, we introduce IntrinsicVoic,e an LLM designed with intrinsic real-time voice interacti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08035v2-abstract-full').style.display = 'inline'; document.getElementById('2410.08035v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08035v2-abstract-full" style="display: none;"> Current methods of building LLMs with voice interaction capabilities rely heavily on explicit text autoregressive generation before or during speech response generation to maintain content quality, which unfortunately brings computational overhead and increases latency in multi-turn interactions. To address this, we introduce IntrinsicVoic,e an LLM designed with intrinsic real-time voice interaction capabilities. IntrinsicVoice aims to facilitate the transfer of textual capabilities of pre-trained LLMs to the speech modality by mitigating the modality gap between text and speech. Our novelty architecture, GroupFormer, can reduce speech sequences to lengths comparable to text sequences while generating high-quality audio, significantly reducing the length difference between speech and text, speeding up inference, and alleviating long-text modeling issues. Additionally, we construct a multi-turn speech-to-speech dialogue dataset named \method-500k which includes nearly 500k turns of speech-to-speech dialogues, and a cross-modality training strategy to enhance the semantic alignment between speech and text. Experimental results demonstrate that IntrinsicVoice can generate high-quality speech response with latency lower than 100ms in multi-turn dialogue scenarios. Demos are available at https://instrinsicvoice.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08035v2-abstract-full').style.display = 'none'; document.getElementById('2410.08035v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05252">arXiv:2410.05252</a> <span> [<a href="https://arxiv.org/pdf/2410.05252">pdf</a>, <a href="https://arxiv.org/format/2410.05252">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Causal Micro-Narratives </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Heddaya%2C+M">Mourad Heddaya</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingcheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chenhao Tan</a>, <a href="/search/cs?searchtype=author&query=Voigt%2C+R">Rob Voigt</a>, <a href="/search/cs?searchtype=author&query=Zentefis%2C+A">Alexander Zentefis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05252v1-abstract-short" style="display: inline;"> We present a novel approach to classify causal micro-narratives from text. These narratives are sentence-level explanations of the cause(s) and/or effect(s) of a target subject. The approach requires only a subject-specific ontology of causes and effects, and we demonstrate it with an application to inflation narratives. Using a human-annotated dataset spanning historical and contemporary US news… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05252v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05252v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05252v1-abstract-full" style="display: none;"> We present a novel approach to classify causal micro-narratives from text. These narratives are sentence-level explanations of the cause(s) and/or effect(s) of a target subject. The approach requires only a subject-specific ontology of causes and effects, and we demonstrate it with an application to inflation narratives. Using a human-annotated dataset spanning historical and contemporary US news articles for training, we evaluate several large language models (LLMs) on this multi-label classification task. The best-performing model--a fine-tuned Llama 3.1 8B--achieves F1 scores of 0.87 on narrative detection and 0.71 on narrative classification. Comprehensive error analysis reveals challenges arising from linguistic ambiguity and highlights how model errors often mirror human annotator disagreements. This research establishes a framework for extracting causal micro-narratives from real-world data, with wide-ranging applications to social science research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05252v1-abstract-full').style.display = 'none'; document.getElementById('2410.05252v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 Workshop on Narrative Understanding</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the The 6th Workshop on Narrative Understanding at EMNLP 2024, pages 67-84, Miami, Florida, USA. Association for Computational Linguistics </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04785">arXiv:2410.04785</a> <span> [<a href="https://arxiv.org/pdf/2410.04785">pdf</a>, <a href="https://arxiv.org/format/2410.04785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Towards Ultra-Low-Power Neuromorphic Speech Enhancement with Spiking-FullSubNet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiang Hao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qu Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04785v1-abstract-short" style="display: inline;"> Speech enhancement is critical for improving speech intelligibility and quality in various audio devices. In recent years, deep learning-based methods have significantly improved speech enhancement performance, but they often come with a high computational cost, which is prohibitive for a large number of edge devices, such as headsets and hearing aids. This work proposes an ultra-low-power speech… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04785v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04785v1-abstract-full" style="display: none;"> Speech enhancement is critical for improving speech intelligibility and quality in various audio devices. In recent years, deep learning-based methods have significantly improved speech enhancement performance, but they often come with a high computational cost, which is prohibitive for a large number of edge devices, such as headsets and hearing aids. This work proposes an ultra-low-power speech enhancement system based on the brain-inspired spiking neural network (SNN) called Spiking-FullSubNet. Spiking-FullSubNet follows a full-band and sub-band fusioned approach to effectively capture both global and local spectral information. To enhance the efficiency of computationally expensive sub-band modeling, we introduce a frequency partitioning method inspired by the sensitivity profile of the human peripheral auditory system. Furthermore, we introduce a novel spiking neuron model that can dynamically control the input information integration and forgetting, enhancing the multi-scale temporal processing capability of SNN, which is critical for speech denoising. Experiments conducted on the recent Intel Neuromorphic Deep Noise Suppression (N-DNS) Challenge dataset show that the Spiking-FullSubNet surpasses state-of-the-art methods by large margins in terms of both speech quality and energy efficiency metrics. Notably, our system won the championship of the Intel N-DNS Challenge (Algorithmic Track), opening up a myriad of opportunities for ultra-low-power speech enhancement at the edge. Our source code and model checkpoints are publicly available at https://github.com/haoxiangsnr/spiking-fullsubnet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04785v1-abstract-full').style.display = 'none'; document.getElementById('2410.04785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18988">arXiv:2409.18988</a> <span> [<a href="https://arxiv.org/pdf/2409.18988">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="General Economics">econ.GN</span> </div> </div> <p class="title is-5 mathjax"> A Unified Framework to Classify Business Activities into International Standard Industrial Classification through Large Language Models for Circular Economy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lan Zhao</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Junhao Ren</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yajuan Sun</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+F">Chuan Fu Tan</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+Z">Zhiquan Yeo</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+G">Gaoxi Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18988v1-abstract-short" style="display: inline;"> Effective information gathering and knowledge codification are pivotal for developing recommendation systems that promote circular economy practices. One promising approach involves the creation of a centralized knowledge repository cataloguing historical waste-to-resource transactions, which subsequently enables the generation of recommendations based on past successes. However, a significant bar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18988v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18988v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18988v1-abstract-full" style="display: none;"> Effective information gathering and knowledge codification are pivotal for developing recommendation systems that promote circular economy practices. One promising approach involves the creation of a centralized knowledge repository cataloguing historical waste-to-resource transactions, which subsequently enables the generation of recommendations based on past successes. However, a significant barrier to constructing such a knowledge repository lies in the absence of a universally standardized framework for representing business activities across disparate geographical regions. To address this challenge, this paper leverages Large Language Models (LLMs) to classify textual data describing economic activities into the International Standard Industrial Classification (ISIC), a globally recognized economic activity classification framework. This approach enables any economic activity descriptions provided by businesses worldwide to be categorized into the unified ISIC standard, facilitating the creation of a centralized knowledge repository. Our approach achieves a 95% accuracy rate on a 182-label test dataset with fine-tuned GPT-2 model. This research contributes to the global endeavour of fostering sustainable circular economy practices by providing a standardized foundation for knowledge codification and recommendation systems deployable across regions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18988v1-abstract-full').style.display = 'none'; document.getElementById('2409.18988v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures, accepted in 2024 IEEE International Conference on Industrial Engineering and Engineering Management (IEEM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18893">arXiv:2409.18893</a> <span> [<a href="https://arxiv.org/pdf/2409.18893">pdf</a>, <a href="https://arxiv.org/format/2409.18893">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HM3: Hierarchical Multi-Objective Model Merging for Pretrained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xingyu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Liang Feng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18893v1-abstract-short" style="display: inline;"> Model merging is a technique that combines multiple large pretrained models into a single model with enhanced performance and broader task adaptability. It has gained popularity in large pretrained model development due to its ability to bypass the need for original training data and further training processes. However, most existing model merging approaches focus solely on exploring the parameter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18893v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18893v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18893v1-abstract-full" style="display: none;"> Model merging is a technique that combines multiple large pretrained models into a single model with enhanced performance and broader task adaptability. It has gained popularity in large pretrained model development due to its ability to bypass the need for original training data and further training processes. However, most existing model merging approaches focus solely on exploring the parameter space, merging models with identical architectures. Merging within the architecture space, despite its potential, remains in its early stages due to the vast search space and the challenges of layer compatibility. This paper marks a significant advance toward more flexible and comprehensive model merging techniques by modeling the architecture-space merging process as a reinforcement learning task. We train policy and value networks using offline sampling of weight vectors, which are then employed for the online optimization of merging strategies. Moreover, a multi-objective optimization paradigm is introduced to accommodate users' diverse task preferences, learning the Pareto front of optimal models to offer customized merging suggestions. Experimental results across multiple tasks, including text translation, mathematical reasoning, and code generation, validate the effectiveness and superiority of the proposed framework in model merging. The code will be made publicly available after the review process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18893v1-abstract-full').style.display = 'none'; document.getElementById('2409.18893v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14801">arXiv:2409.14801</a> <span> [<a href="https://arxiv.org/pdf/2409.14801">pdf</a>, <a href="https://arxiv.org/format/2409.14801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MTP: A Dataset for Multi-Modal Turning Points in Casual Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ho%2C+G+D">Gia-Bao Dinh Ho</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chang Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Darban%2C+Z+Z">Zahra Zamanzadeh Darban</a>, <a href="/search/cs?searchtype=author&query=Salehi%2C+M">Mahsa Salehi</a>, <a href="/search/cs?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a>, <a href="/search/cs?searchtype=author&query=Buntine%2C+W">Wray Buntine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14801v1-abstract-short" style="display: inline;"> Detecting critical moments, such as emotional outbursts or changes in decisions during conversations, is crucial for understanding shifts in human behavior and their consequences. Our work introduces a novel problem setting focusing on these moments as turning points (TPs), accompanied by a meticulously curated, high-consensus, human-annotated multi-modal dataset. We provide precise timestamps, de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14801v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14801v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14801v1-abstract-full" style="display: none;"> Detecting critical moments, such as emotional outbursts or changes in decisions during conversations, is crucial for understanding shifts in human behavior and their consequences. Our work introduces a novel problem setting focusing on these moments as turning points (TPs), accompanied by a meticulously curated, high-consensus, human-annotated multi-modal dataset. We provide precise timestamps, descriptions, and visual-textual evidence high-lighting changes in emotions, behaviors, perspectives, and decisions at these turning points. We also propose a framework, TPMaven, utilizing state-of-the-art vision-language models to construct a narrative from the videos and large language models to classify and detect turning points in our multi-modal dataset. Evaluation results show that TPMaven achieves an F1-score of 0.88 in classification and 0.61 in detection, with additional explanations aligning with human expectations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14801v1-abstract-full').style.display = 'none'; document.getElementById('2409.14801v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACL 2024 main conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12964">arXiv:2409.12964</a> <span> [<a href="https://arxiv.org/pdf/2409.12964">pdf</a>, <a href="https://arxiv.org/format/2409.12964">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OpenRANet: Neuralized Spectrum Access by Joint Subcarrier and Power Allocation with Optimization-based Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siya Chen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+W">Chee Wei Tan</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiangping Zhai</a>, <a href="/search/cs?searchtype=author&query=Poor%2C+H+V">H. Vincent Poor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12964v1-abstract-short" style="display: inline;"> The next-generation radio access network (RAN), known as Open RAN, is poised to feature an AI-native interface for wireless cellular networks, including emerging satellite-terrestrial systems, making deep learning integral to its operation. In this paper, we address the nonconvex optimization challenge of joint subcarrier and power allocation in Open RAN, with the objective of minimizing the total… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12964v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12964v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12964v1-abstract-full" style="display: none;"> The next-generation radio access network (RAN), known as Open RAN, is poised to feature an AI-native interface for wireless cellular networks, including emerging satellite-terrestrial systems, making deep learning integral to its operation. In this paper, we address the nonconvex optimization challenge of joint subcarrier and power allocation in Open RAN, with the objective of minimizing the total power consumption while ensuring users meet their transmission data rate requirements. We propose OpenRANet, an optimization-based deep learning model that integrates machine-learning techniques with iterative optimization algorithms. We start by transforming the original nonconvex problem into convex subproblems through decoupling, variable transformation, and relaxation techniques. These subproblems are then efficiently solved using iterative methods within the standard interference function framework, enabling the derivation of primal-dual solutions. These solutions integrate seamlessly as a convex optimization layer within OpenRANet, enhancing constraint adherence, solution accuracy, and computational efficiency by combining machine learning with convex analysis, as shown in numerical experiments. OpenRANet also serves as a foundation for designing resource-constrained AI-native wireless optimization strategies for broader scenarios like multi-cell systems, satellite-terrestrial networks, and future Open RAN deployments with complex power consumption requirements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12964v1-abstract-full').style.display = 'none'; document.getElementById('2409.12964v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10897">arXiv:2409.10897</a> <span> [<a href="https://arxiv.org/pdf/2409.10897">pdf</a>, <a href="https://arxiv.org/format/2409.10897">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> AutoSpec: Automated Generation of Neural Network Specifications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+S">Shuowei Jin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+F+Y">Francis Y. Yan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Kalia%2C+A">Anuj Kalia</a>, <a href="/search/cs?searchtype=author&query=Foukas%2C+X">Xenofon Foukas</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Z+M">Z. Morley Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10897v2-abstract-short" style="display: inline;"> The increasing adoption of neural networks in learning-augmented systems highlights the importance of model safety and robustness, particularly in safety-critical domains. Despite progress in the formal verification of neural networks, current practices require users to manually define model specifications -- properties that dictate expected model behavior in various scenarios. This manual process… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10897v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10897v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10897v2-abstract-full" style="display: none;"> The increasing adoption of neural networks in learning-augmented systems highlights the importance of model safety and robustness, particularly in safety-critical domains. Despite progress in the formal verification of neural networks, current practices require users to manually define model specifications -- properties that dictate expected model behavior in various scenarios. This manual process, however, is prone to human error, limited in scope, and time-consuming. In this paper, we introduce AutoSpec, the first framework to automatically generate comprehensive and accurate specifications for neural networks in learning-augmented systems. We also propose the first set of metrics for assessing the accuracy and coverage of model specifications, establishing a benchmark for future comparisons. Our evaluation across four distinct applications shows that AutoSpec outperforms human-defined specifications as well as two baseline approaches introduced in this study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10897v2-abstract-full').style.display = 'none'; document.getElementById('2409.10897v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05573">arXiv:2409.05573</a> <span> [<a href="https://arxiv.org/pdf/2409.05573">pdf</a>, <a href="https://arxiv.org/format/2409.05573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning to Model Graph Structural Information on MLPs via Graph Structure Self-Contrasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lirong Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haitao Lin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guojiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S+Z">Stan Z. Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05573v1-abstract-short" style="display: inline;"> Recent years have witnessed great success in handling graph-related tasks with Graph Neural Networks (GNNs). However, most existing GNNs are based on message passing to perform feature aggregation and transformation, where the structural information is explicitly involved in the forward propagation by coupling with node features through graph convolution at each layer. As a result, subtle feature… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05573v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05573v1-abstract-full" style="display: none;"> Recent years have witnessed great success in handling graph-related tasks with Graph Neural Networks (GNNs). However, most existing GNNs are based on message passing to perform feature aggregation and transformation, where the structural information is explicitly involved in the forward propagation by coupling with node features through graph convolution at each layer. As a result, subtle feature noise or structure perturbation may cause severe error propagation, resulting in extremely poor robustness. In this paper, we rethink the roles played by graph structural information in graph data training and identify that message passing is not the only path to modeling structural information. Inspired by this, we propose a simple but effective Graph Structure Self-Contrasting (GSSC) framework that learns graph structural information without message passing. The proposed framework is based purely on Multi-Layer Perceptrons (MLPs), where the structural information is only implicitly incorporated as prior knowledge to guide the computation of supervision signals, substituting the explicit message propagation as in GNNs. Specifically, it first applies structural sparsification to remove potentially uninformative or noisy edges in the neighborhood, and then performs structural self-contrasting in the sparsified neighborhood to learn robust node representations. Finally, structural sparsification and self-contrasting are formulated as a bi-level optimization problem and solved in a unified framework. Extensive experiments have qualitatively and quantitatively demonstrated that the GSSC framework can produce truly encouraging performance with better generalization and robustness than other leading competitors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05573v1-abstract-full').style.display = 'none'; document.getElementById('2409.05573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05423">arXiv:2409.05423</a> <span> [<a href="https://arxiv.org/pdf/2409.05423">pdf</a>, <a href="https://arxiv.org/format/2409.05423">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STLM Engineering Report: Dropout </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hillier%2C+D">Dylan Hillier</a>, <a href="/search/cs?searchtype=author&query=Guertler%2C+L">Leon Guertler</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bobby Cheng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheston Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05423v1-abstract-short" style="display: inline;"> In this work we explore the relevance of dropout for modern language models, particularly in the context of models on the scale of <100M parameters. We explore it's relevance firstly in the regime of improving the sample efficiency of models given small, high quality datasets, and secondly in the regime of improving the quality of its fit on larger datasets where models may underfit. We find that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05423v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05423v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05423v1-abstract-full" style="display: none;"> In this work we explore the relevance of dropout for modern language models, particularly in the context of models on the scale of <100M parameters. We explore it's relevance firstly in the regime of improving the sample efficiency of models given small, high quality datasets, and secondly in the regime of improving the quality of its fit on larger datasets where models may underfit. We find that concordant with conventional wisdom, dropout remains effective in the overfitting scenario, and that furthermore it may have some relevance for improving the fit of models even in the case of excess data, as suggested by previous research. In the process we find that the existing explanation for the mechanism behind this performance gain is not applicable in the case of language modelling. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05423v1-abstract-full').style.display = 'none'; document.getElementById('2409.05423v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 3 figures, For code base see https://github.com/LeonGuertler/SuperTinyLanguageModels</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04270">arXiv:2409.04270</a> <span> [<a href="https://arxiv.org/pdf/2409.04270">pdf</a>, <a href="https://arxiv.org/format/2409.04270">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Advancing Automated Knowledge Transfer in Evolutionary Multitasking via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuxiao Huang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xuebin Lv</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shenghao Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Liang Feng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04270v1-abstract-short" style="display: inline;"> Evolutionary Multi-task Optimization (EMTO) is a paradigm that leverages knowledge transfer across simultaneously optimized tasks for enhanced search performance. To facilitate EMTO's performance, various knowledge transfer models have been developed for specific optimization tasks. However, designing these models often requires substantial expert knowledge. Recently, large language models (LLMs)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04270v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04270v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04270v1-abstract-full" style="display: none;"> Evolutionary Multi-task Optimization (EMTO) is a paradigm that leverages knowledge transfer across simultaneously optimized tasks for enhanced search performance. To facilitate EMTO's performance, various knowledge transfer models have been developed for specific optimization tasks. However, designing these models often requires substantial expert knowledge. Recently, large language models (LLMs) have achieved remarkable success in autonomous programming, aiming to produce effective solvers for specific problems. In this work, a LLM-based optimization paradigm is introduced to establish an autonomous model factory for generating knowledge transfer models, ensuring effective and efficient knowledge transfer across various optimization tasks. To evaluate the performance of the proposed method, we conducted comprehensive empirical studies comparing the knowledge transfer model generated by the LLM with existing state-of-the-art knowledge transfer methods. The results demonstrate that the generated model is able to achieve superior or competitive performance against hand-crafted knowledge transfer models in terms of both efficiency and effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04270v1-abstract-full').style.display = 'none'; document.getElementById('2409.04270v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03320">arXiv:2409.03320</a> <span> [<a href="https://arxiv.org/pdf/2409.03320">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> YOLO-PPA based Efficient Traffic Sign Detection for Cruise Control in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chaoyi Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangtian Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qianyi Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03320v1-abstract-short" style="display: inline;"> It is very important to detect traffic signs efficiently and accurately in autonomous driving systems. However, the farther the distance, the smaller the traffic signs. Existing object detection algorithms can hardly detect these small scaled signs.In addition, the performance of embedded devices on vehicles limits the scale of detection models.To address these challenges, a YOLO PPA based traffic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03320v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03320v1-abstract-full" style="display: none;"> It is very important to detect traffic signs efficiently and accurately in autonomous driving systems. However, the farther the distance, the smaller the traffic signs. Existing object detection algorithms can hardly detect these small scaled signs.In addition, the performance of embedded devices on vehicles limits the scale of detection models.To address these challenges, a YOLO PPA based traffic sign detection algorithm is proposed in this paper.The experimental results on the GTSDB dataset show that compared to the original YOLO, the proposed method improves inference efficiency by 11.2%. The mAP 50 is also improved by 93.2%, which demonstrates the effectiveness of the proposed YOLO PPA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03320v1-abstract-full').style.display = 'none'; document.getElementById('2409.03320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00106">arXiv:2409.00106</a> <span> [<a href="https://arxiv.org/pdf/2409.00106">pdf</a>, <a href="https://arxiv.org/format/2409.00106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Zero-Shot Visual Reasoning by Vision-Language Models: Benchmarking and Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nagar%2C+A">Aishik Nagar</a>, <a href="/search/cs?searchtype=author&query=Jaiswal%2C+S">Shantanu Jaiswal</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheston Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00106v1-abstract-short" style="display: inline;"> Vision-language models (VLMs) have shown impressive zero- and few-shot performance on real-world visual question answering (VQA) benchmarks, alluding to their capabilities as visual reasoning engines. However, the benchmarks being used conflate "pure" visual reasoning with world knowledge, and also have questions that involve a limited number of reasoning steps. Thus, it remains unclear whether a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00106v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00106v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00106v1-abstract-full" style="display: none;"> Vision-language models (VLMs) have shown impressive zero- and few-shot performance on real-world visual question answering (VQA) benchmarks, alluding to their capabilities as visual reasoning engines. However, the benchmarks being used conflate "pure" visual reasoning with world knowledge, and also have questions that involve a limited number of reasoning steps. Thus, it remains unclear whether a VLM's apparent visual reasoning performance is due to its world knowledge, or due to actual visual reasoning capabilities. To clarify this ambiguity, we systematically benchmark and dissect the zero-shot visual reasoning capabilities of VLMs through synthetic datasets that require minimal world knowledge, and allow for analysis over a broad range of reasoning steps. We focus on two novel aspects of zero-shot visual reasoning: i) evaluating the impact of conveying scene information as either visual embeddings or purely textual scene descriptions to the underlying large language model (LLM) of the VLM, and ii) comparing the effectiveness of chain-of-thought prompting to standard prompting for zero-shot visual reasoning. We find that the underlying LLMs, when provided textual scene descriptions, consistently perform better compared to being provided visual embeddings. In particular, 18% higher accuracy is achieved on the PTR dataset. We also find that CoT prompting performs marginally better than standard prompting only for the comparatively large GPT-3.5-Turbo (175B) model, and does worse for smaller-scale models. This suggests the emergence of CoT abilities for visual reasoning in LLMs at larger scales even when world knowledge is limited. Overall, we find limitations in the abilities of VLMs and LLMs for more complex visual reasoning, and highlight the important role that LLMs can play in visual reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00106v1-abstract-full').style.display = 'none'; document.getElementById('2409.00106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15903">arXiv:2408.15903</a> <span> [<a href="https://arxiv.org/pdf/2408.15903">pdf</a>, <a href="https://arxiv.org/format/2408.15903">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-Based Multi-Hop Question Answering with Knowledge Graph Integration in Evolving Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruirui Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weifeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+C">Chengwei Qin</a>, <a href="/search/cs?searchtype=author&query=Rawal%2C+I+S">Ishaan Singh Rawal</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheston Tan</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+D">Dongkyu Choi</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+B">Bo Xiong</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+B">Bo Ai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15903v1-abstract-short" style="display: inline;"> The rapid obsolescence of information in Large Language Models (LLMs) has driven the development of various techniques to incorporate new facts. However, existing methods for knowledge editing still face difficulties with multi-hop questions that require accurate fact identification and sequential logical reasoning, particularly among numerous fact updates. To tackle these challenges, this paper i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15903v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15903v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15903v1-abstract-full" style="display: none;"> The rapid obsolescence of information in Large Language Models (LLMs) has driven the development of various techniques to incorporate new facts. However, existing methods for knowledge editing still face difficulties with multi-hop questions that require accurate fact identification and sequential logical reasoning, particularly among numerous fact updates. To tackle these challenges, this paper introduces Graph Memory-based Editing for Large Language Models (GMeLLo), a straitforward and effective method that merges the explicit knowledge representation of Knowledge Graphs (KGs) with the linguistic flexibility of LLMs. Beyond merely leveraging LLMs for question answering, GMeLLo employs these models to convert free-form language into structured queries and fact triples, facilitating seamless interaction with KGs for rapid updates and precise multi-hop reasoning. Our results show that GMeLLo significantly surpasses current state-of-the-art knowledge editing methods in the multi-hop question answering benchmark, MQuAKE, especially in scenarios with extensive knowledge edits. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15903v1-abstract-full').style.display = 'none'; document.getElementById('2408.15903v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14917">arXiv:2408.14917</a> <span> [<a href="https://arxiv.org/pdf/2408.14917">pdf</a>, <a href="https://arxiv.org/format/2408.14917">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> PMSN: A Parallel Multi-compartment Spiking Neuron for Multi-scale Temporal Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yinsong Yan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yujie Wu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14917v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) hold great potential to realize brain-inspired, energy-efficient computational systems. However, current SNNs still fall short in terms of multi-scale temporal processing compared to their biological counterparts. This limitation has resulted in poor performance in many pattern recognition tasks with information that varies across different timescales. To address thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14917v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14917v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) hold great potential to realize brain-inspired, energy-efficient computational systems. However, current SNNs still fall short in terms of multi-scale temporal processing compared to their biological counterparts. This limitation has resulted in poor performance in many pattern recognition tasks with information that varies across different timescales. To address this issue, we put forward a novel spiking neuron model called Parallel Multi-compartment Spiking Neuron (PMSN). The PMSN emulates biological neurons by incorporating multiple interacting substructures and allows for flexible adjustment of the substructure counts to effectively represent temporal information across diverse timescales. Additionally, to address the computational burden associated with the increased complexity of the proposed model, we introduce two parallelization techniques that decouple the temporal dependencies of neuronal updates, enabling parallelized training across different time steps. Our experimental results on a wide range of pattern recognition tasks demonstrate the superiority of PMSN. It outperforms other state-of-the-art spiking neuron models in terms of its temporal processing capacity, training speed, and computation cost. Specifically, compared with the commonly used Leaky Integrate-and-Fire neuron, PMSN offers a simulation acceleration of over 10 $\times$ and a 30 % improvement in accuracy on Sequential CIFAR10 dataset, while maintaining comparable computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14917v1-abstract-full').style.display = 'none'; document.getElementById('2408.14917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13987">arXiv:2408.13987</a> <span> [<a href="https://arxiv.org/pdf/2408.13987">pdf</a>, <a href="https://arxiv.org/format/2408.13987">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Focused Large Language Models are Stable Many-Shot Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+P">Peiwen Yuan</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shaoxiong Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinglin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yueqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chuyi Tan</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+B">Boyuan Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Heda Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yao Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13987v1-abstract-short" style="display: inline;"> In-Context Learning (ICL) enables large language models (LLMs) to achieve rapid task adaptation by learning from demonstrations. With the increase in available context length of LLMs, recent experiments have shown that the performance of ICL does not necessarily scale well in many-shot (demonstration) settings. We theoretically and experimentally confirm that the reason lies in more demonstrations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13987v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13987v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13987v1-abstract-full" style="display: none;"> In-Context Learning (ICL) enables large language models (LLMs) to achieve rapid task adaptation by learning from demonstrations. With the increase in available context length of LLMs, recent experiments have shown that the performance of ICL does not necessarily scale well in many-shot (demonstration) settings. We theoretically and experimentally confirm that the reason lies in more demonstrations dispersing the model attention from the query, hindering its understanding of key content. Inspired by how humans learn from examples, we propose a training-free method FocusICL, which conducts triviality filtering to avoid attention being diverted by unimportant contents at token-level and operates hierarchical attention to further ensure sufficient attention towards current query at demonstration-level. We also design an efficient hyperparameter searching strategy for FocusICL based on model perplexity of demonstrations. Comprehensive experiments validate that FocusICL achieves an average performance improvement of 5.2% over vanilla ICL and scales well with many-shot demonstrations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13987v1-abstract-full').style.display = 'none'; document.getElementById('2408.13987v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11330">arXiv:2408.11330</a> <span> [<a href="https://arxiv.org/pdf/2408.11330">pdf</a>, <a href="https://arxiv.org/format/2408.11330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Design Principle Transfer in Neural Architecture Search via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xun Zhou</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+L">Liang Feng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xingyu Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhichao Lu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11330v1-abstract-short" style="display: inline;"> Transferable neural architecture search (TNAS) has been introduced to design efficient neural architectures for multiple tasks, to enhance the practical applicability of NAS in real-world scenarios. In TNAS, architectural knowledge accumulated in previous search processes is reused to warm up the architecture search for new tasks. However, existing TNAS methods still search in an extensive search… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11330v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11330v1-abstract-full" style="display: none;"> Transferable neural architecture search (TNAS) has been introduced to design efficient neural architectures for multiple tasks, to enhance the practical applicability of NAS in real-world scenarios. In TNAS, architectural knowledge accumulated in previous search processes is reused to warm up the architecture search for new tasks. However, existing TNAS methods still search in an extensive search space, necessitating the evaluation of numerous architectures. To overcome this challenge, this work proposes a novel transfer paradigm, i.e., design principle transfer. In this work, the linguistic description of various structural components' effects on architectural performance is termed design principles. They are learned from established architectures and then can be reused to reduce the search space by discarding unpromising architectures. Searching in the refined search space can boost both the search performance and efficiency for new NAS tasks. To this end, a large language model (LLM)-assisted design principle transfer (LAPT) framework is devised. In LAPT, LLM is applied to automatically reason the design principles from a set of given architectures, and then a principle adaptation method is applied to refine these principles progressively based on the new search results. Experimental results show that LAPT can beat the state-of-the-art TNAS methods on most tasks and achieve comparable performance on others. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11330v1-abstract-full').style.display = 'none'; document.getElementById('2408.11330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10287">arXiv:2408.10287</a> <span> [<a href="https://arxiv.org/pdf/2408.10287">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Recognizing Beam Profiles from Silicon Photonics Gratings using Transformer Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lim%2C+Y+D">Yu Dian Lim</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H+Y">Hong Yu Li</a>, <a href="/search/cs?searchtype=author&query=Goh%2C+S+C+K">Simon Chun Kiat Goh</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Peng Zhao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C+S">Chuan Seng Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10287v3-abstract-short" style="display: inline;"> Over the past decade, there has been extensive work in developing integrated silicon photonics (SiPh) gratings for the optical addressing of trapped ion qubits in the ion trap quantum computing community. However, when viewing beam profiles from infrared (IR) cameras, it is often difficult to determine the corresponding heights where the beam profiles are located. In this work, we developed transf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10287v3-abstract-full').style.display = 'inline'; document.getElementById('2408.10287v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10287v3-abstract-full" style="display: none;"> Over the past decade, there has been extensive work in developing integrated silicon photonics (SiPh) gratings for the optical addressing of trapped ion qubits in the ion trap quantum computing community. However, when viewing beam profiles from infrared (IR) cameras, it is often difficult to determine the corresponding heights where the beam profiles are located. In this work, we developed transformer models to recognize the corresponding height categories of beam profiles of light from SiPh gratings. The model is trained using two techniques: (1) input patches, and (2) input sequence. For model trained with input patches, the model achieved recognition accuracy of 0.938. Meanwhile, model trained with input sequence shows lower accuracy of 0.895. However, when repeating the model-training 150 cycles, model trained with input patches shows inconsistent accuracy ranges between 0.445 to 0.959, while model trained with input sequence exhibit higher accuracy values between 0.789 to 0.936. The obtained outcomes can be expanded to various applications, including auto-focusing of light beam and auto-adjustment of z-axis stage to acquire desired beam profiles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10287v3-abstract-full').style.display = 'none'; document.getElementById('2408.10287v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09647">arXiv:2408.09647</a> <span> [<a href="https://arxiv.org/pdf/2408.09647">pdf</a>, <a href="https://arxiv.org/format/2408.09647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> C2P-CLIP: Injecting Category Common Prompt in CLIP to Enhance Generalization in Deepfake Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+C">Chuangchuang Tan</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+R">Renshuai Tao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+G">Guanghua Gu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Baoyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yao Zhao</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yunchao Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09647v1-abstract-short" style="display: inline;"> This work focuses on AIGC detection to develop universal detectors capable of identifying various types of forgery images. Recent studies have found large pre-trained models, such as CLIP, are effective for generalizable deepfake detection along with linear classifiers. However, two critical issues remain unresolved: 1) understanding why CLIP features are effective on deepfake detection through a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09647v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09647v1-abstract-full" style="display: none;"> This work focuses on AIGC detection to develop universal detectors capable of identifying various types of forgery images. Recent studies have found large pre-trained models, such as CLIP, are effective for generalizable deepfake detection along with linear classifiers. However, two critical issues remain unresolved: 1) understanding why CLIP features are effective on deepfake detection through a linear classifier; and 2) exploring the detection potential of CLIP. In this study, we delve into the underlying mechanisms of CLIP's detection capabilities by decoding its detection features into text and performing word frequency analysis. Our finding indicates that CLIP detects deepfakes by recognizing similar concepts (Fig. \ref{fig:fig1} a). Building on this insight, we introduce Category Common Prompt CLIP, called C2P-CLIP, which integrates the category common prompt into the text encoder to inject category-related concepts into the image encoder, thereby enhancing detection performance (Fig. \ref{fig:fig1} b). Our method achieves a 12.41\% improvement in detection accuracy compared to the original CLIP, without introducing additional parameters during testing. Comprehensive experiments conducted on two widely-used datasets, encompassing 20 generation models, validate the efficacy of the proposed method, demonstrating state-of-the-art performance. The code is available at \url{https://github.com/chuangchuangtan/C2P-CLIP-DeepfakeDetection} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09647v1-abstract-full').style.display = 'none'; document.getElementById('2408.09647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Tan%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository