Search | arXiv e-print repository

Showing 1–50 of 307 results for author: Pham, T

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2411.16318 [pdf, other] (cs.CV, cs.AI)
   One Diffusion to Generate Them All
   Authors: Duong H. Le, Tuan Pham, Sangho Lee, Christopher Clark, Aniruddha Kembhavi, Stephan Mandt, Ranjay Krishna, Jiasen Lu
   Abstract: We introduce OneDiffusion, a versatile, large-scale diffusion model that seamlessly supports bidirectional image synthesis and understanding across diverse tasks. It enables conditional generation from inputs such as text, depth, pose, layout, and semantic maps, while also handling tasks like image deblurring, upscaling, and reverse processes such as depth estimation and segmentation. Additionally, OneDiffusion allows for multi-view generation, camera pose estimation, and instant personalization using sequential image inputs. Our model takes a straightforward yet effective approach by treating all tasks as frame sequences with varying noise scales during training, allowing any frame to act as a conditioning image at inference time. Our unified training framework removes the need for specialized architectures, supports scalable multi-task training, and adapts smoothly to any resolution, enhancing both generalization and scalability. Experimental results demonstrate competitive performance across tasks in both generation and prediction, such as text-to-image, multiview generation, ID preservation, depth estimation, and camera pose estimation, despite a relatively small training dataset. Our code and checkpoint are freely available at https://github.com/lehduong/OneDiffusion
   Submitted 25 November, 2024; originally announced November 2024.
   Comments: the two first authors contributed equally
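The "frame sequences with varying noise scales" idea above lends itself to a compact illustration. Below is a minimal sketch assuming a generic `denoiser(noisy, t)` network and a toy linear schedule; the names and the schedule are hypothetical, not the authors' released code.

```python
import torch
import torch.nn.functional as F

def one_diffusion_step(denoiser, frames, num_timesteps=1000):
    """Sketch: treat a task as a frame sequence and draw an independent
    timestep per frame, so a frame with t near 0 stays nearly clean and
    behaves as a conditioning image. frames: (batch, n_frames, C, H, W)."""
    batch, n_frames = frames.shape[:2]
    t = torch.randint(0, num_timesteps, (batch, n_frames))  # per-frame noise scale
    alpha = 1.0 - t.float() / num_timesteps                 # toy linear schedule
    alpha = alpha.view(batch, n_frames, 1, 1, 1)
    noise = torch.randn_like(frames)
    noisy = alpha.sqrt() * frames + (1.0 - alpha).sqrt() * noise
    return F.mse_loss(denoiser(noisy, t), noise)            # predict the noise
```

At inference time, under this reading, conditioning frames are simply assigned timestep 0 while the target frame is denoised from pure noise.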
2. arXiv:2411.15413 [pdf, other] (cs.CV, cs.AI)
   FG-CXR: A Radiologist-Aligned Gaze Dataset for Enhancing Interpretability in Chest X-Ray Report Generation
   Authors: Trong Thang Pham, Ngoc-Vuong Ho, Nhat-Tan Bui, Thinh Phan, Patel Brijesh, Donald Adjeroh, Gianfranco Doretto, Anh Nguyen, Carol C. Wu, Hien Nguyen, Ngan Le
   Abstract: Developing an interpretable system for generating reports in chest X-ray (CXR) analysis is becoming increasingly crucial in Computer-aided Diagnosis (CAD) systems, enabling radiologists to comprehend the decisions made by these systems. Despite the growth of diverse datasets and methods focusing on report generation, there remains a notable gap in how closely these models' generated reports align with the interpretations of real radiologists. In this study, we tackle this challenge by first introducing the Fine-Grained CXR (FG-CXR) dataset, which provides fine-grained paired information between the captions generated by radiologists and the corresponding gaze attention heatmaps for each anatomy. Unlike existing datasets that include a raw sequence of gaze alongside a report, with significant misalignment between gaze location and report content, our FG-CXR dataset offers a finer-grained alignment between gaze attention and the diagnosis transcript. Furthermore, our analysis reveals that simply applying black-box image captioning methods to generate reports cannot adequately explain which information in the CXR is utilized, or how long one needs to attend to it, in order to accurately generate reports. Consequently, we propose a novel explainable radiologist's attention generator network (Gen-XAI) that mimics the diagnosis process of radiologists, explicitly constraining its output to closely align with both the radiologist's gaze attention and transcript. Finally, we perform extensive experiments to illustrate the effectiveness of our method. Our dataset and checkpoint are available at https://github.com/UARK-AICV/FG-CXR
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: ACCV 2024
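The abstract says Gen-XAI is explicitly constrained to match both the gaze attention and the transcript. One plausible reading is a two-term objective; the sketch below is my own assumption (the KL choice, `attn_weight`, and tensor shapes are hypothetical), not the paper's actual loss.

```python
import torch.nn.functional as F

def gen_xai_loss(pred_attention, gaze_heatmap, report_logits, transcript_ids,
                 attn_weight=1.0):
    """Hypothetical combined objective: pull the model's attention map toward
    the radiologist's gaze heatmap while supervising the report text."""
    # Attention alignment: treat both maps as distributions over image regions.
    p = F.log_softmax(pred_attention.flatten(1), dim=-1)
    q = F.softmax(gaze_heatmap.flatten(1), dim=-1)
    attn_loss = F.kl_div(p, q, reduction="batchmean")
    # Token-level cross-entropy against the diagnosis transcript.
    # report_logits: (batch, seq, vocab) -> (batch, vocab, seq) for F.cross_entropy.
    text_loss = F.cross_entropy(report_logits.transpose(1, 2), transcript_ids)
    return text_loss + attn_weight * attn_loss
```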
3. arXiv:2411.09955 [pdf, other] (cs.CV, cs.AI, cs.HC, cs.LG, cs.MM)
   Instruction-Guided Editing Controls for Images and Multimedia: A Survey in LLM era
   Authors: Thanh Tam Nguyen, Zhao Ren, Trinh Pham, Thanh Trung Huynh, Phi Le Nguyen, Hongzhi Yin, Quoc Viet Hung Nguyen
   Abstract: The rapid advancement of large language models (LLMs) and multimodal learning has transformed digital content creation and manipulation. Traditional visual editing tools require significant expertise, limiting accessibility. Recent strides in instruction-based editing have enabled intuitive interaction with visual content, using natural language as a bridge between user intent and complex editing operations. This survey provides an overview of these techniques, focusing on how LLMs and multimodal models empower users to achieve precise visual modifications without deep technical knowledge. By synthesizing over 100 publications, we explore methods from generative adversarial networks to diffusion models, examining multimodal integration for fine-grained content control. We discuss practical applications across domains such as fashion, 3D scene manipulation, and video synthesis, highlighting increased accessibility and alignment with human intuition. Our survey compares existing literature, emphasizing LLM-empowered editing, and identifies key challenges to stimulate further research. We aim to democratize powerful visual editing across various industries, from entertainment to education. Interested readers are encouraged to access our repository at https://github.com/tamlhp/awesome-instruction-editing
   Submitted 21 November, 2024; v1 submitted 15 November, 2024; originally announced November 2024.
   Comments: Fixed a serious error in author information

4. arXiv:2411.09944 [pdf, other] (cs.CL)
   SlimLM: An Efficient Small Language Model for On-Device Document Assistance
   Authors: Thang M. Pham, Phat T. Nguyen, Seunghyun Yoon, Viet Dac Lai, Franck Dernoncourt, Trung Bui
   Abstract: While small language models (SLMs) show promise for mobile deployment, their real-world performance and applications on smartphones remain underexplored. We present SlimLM, a series of SLMs optimized for document assistance tasks on mobile devices. Through extensive experiments on a Samsung Galaxy S24, we identify the optimal trade-offs between model size (ranging from 125M to 7B parameters), context length, and inference time for efficient on-device processing. SlimLM is pre-trained on SlimPajama-627B and fine-tuned on DocAssist, our constructed dataset for summarization, question answering, and suggestion tasks. Our smallest model demonstrates efficient performance on the S24, while larger variants offer enhanced capabilities within mobile constraints. We evaluate SlimLM against existing SLMs, showing comparable or superior performance and offering a benchmark for future research in on-device language models. We also provide an Android application, offering practical insights into SLM deployment. Our findings provide valuable insights and illuminate the capabilities of running advanced language models on high-end smartphones, potentially reducing server costs and enhancing privacy through on-device processing.
   Submitted 25 November, 2024; v1 submitted 14 November, 2024; originally announced November 2024.
5. arXiv:2411.06990 [pdf, ps, other] (stat.ML, cs.LG)
   Causal-discovery-based root-cause analysis and its application in time-series prediction error diagnosis
   Authors: Hiroshi Yokoyama, Ryusei Shingaki, Kaneharu Nishino, Shohei Shimizu, Thong Pham
   Abstract: Recent rapid advancements in machine learning have greatly enhanced the accuracy of prediction models, but most models remain "black boxes", making prediction error diagnosis challenging, especially with outliers. This lack of transparency hinders trust and reliability in industrial applications. Heuristic attribution methods, while helpful, often fail to capture true causal relationships, leading to inaccurate error attributions. Various root-cause analysis methods have been developed using Shapley values, yet they typically require predefined causal graphs, limiting their applicability to prediction errors in machine learning models. To address these limitations, we introduce the Causal-Discovery-based Root-Cause Analysis (CD-RCA) method, which estimates causal relationships between the prediction error and the explanatory variables without needing a predefined causal graph. By simulating synthetic error data, CD-RCA can identify variable contributions to outliers in prediction errors via Shapley values. Extensive simulations show that CD-RCA outperforms current heuristic attribution methods, and a sensitivity analysis reveals new patterns where Shapley values may misattribute errors, paving the way for more accurate error attribution methods.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: 10 pages with 5 figures
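CD-RCA itself first discovers a causal graph; the sketch below shows only the generic Shapley-based error-attribution step such methods build on, using the `shap` package on a toy regression problem. The error function here cheats by using the known ground-truth formula, which a real diagnosis pipeline would replace with observed targets.

```python
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor

# Toy data: y depends on x0 and x1 only; train a black-box model on it.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = X[:, 0] + 2 * X[:, 1] + rng.normal(scale=0.1, size=500)
model = RandomForestRegressor(n_estimators=50).fit(X, y)

# Attribute the *prediction error* (not the prediction) to input variables
# by explaining a function that returns the signed error.
error_fn = lambda data: model.predict(data) - (data[:, 0] + 2 * data[:, 1])
explainer = shap.Explainer(error_fn, X)         # model-agnostic explainer
outlier = X[:1] + np.array([[0.0, 5.0, 0.0]])   # push x1 far out of range
print(explainer(outlier).values)                # per-variable error attribution
```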
6. arXiv:2411.05780 [pdf, other] (cs.CV, cs.AI)
   GazeSearch: Radiology Findings Search Benchmark
   Authors: Trong Thang Pham, Tien-Phat Nguyen, Yuki Ikebe, Akash Awasthi, Zhigang Deng, Carol C. Wu, Hien Nguyen, Ngan Le
   Abstract: Medical eye-tracking data is an important information source for understanding how radiologists visually interpret medical images. This information not only improves the accuracy of deep learning models for X-ray analysis but also their interpretability, enhancing transparency in decision-making. However, current eye-tracking data is dispersed, unprocessed, and ambiguous, making it difficult to derive meaningful insights. There is therefore a need to create a new dataset with more focused and purposeful eye-tracking data, improving its utility for diagnostic applications. In this work, we propose a refinement method inspired by the target-present visual search challenge: there is a specific finding, and fixations are guided to locate it. After refining the existing eye-tracking datasets, we transform them into a curated visual search dataset, called GazeSearch, specifically for radiology findings, where each fixation sequence is purposefully aligned to the task of locating a particular finding. Subsequently, we introduce a scan path prediction baseline, called ChestSearch, specifically tailored to GazeSearch. Finally, we employ the newly introduced GazeSearch as a benchmark to evaluate the performance of current state-of-the-art methods, offering a comprehensive assessment for visual search in the medical imaging domain.
   Submitted 8 November, 2024; originally announced November 2024.
   Comments: Accepted WACV 2025
7. arXiv:2411.01123 [pdf, other] (cs.CV)
   X-Drive: Cross-modality consistent multi-sensor data synthesis for driving scenarios
   Authors: Yichen Xie, Chenfeng Xu, Chensheng Peng, Shuqi Zhao, Nhat Ho, Alexander T. Pham, Mingyu Ding, Masayoshi Tomizuka, Wei Zhan
   Abstract: Recent advancements have exploited diffusion models for the synthesis of either LiDAR point clouds or camera image data in driving scenarios. Despite their success in modeling single-modality marginal distributions, the mutual reliance between different modalities in describing complex driving scenes remains underexplored. To fill this gap, we propose a novel framework, X-DRIVE, to model the joint distribution of point clouds and multi-view images via a dual-branch latent diffusion model architecture. Considering the distinct geometrical spaces of the two modalities, X-DRIVE conditions the synthesis of each modality on the corresponding local regions from the other modality, ensuring better alignment and realism. To further handle spatial ambiguity during denoising, we design the cross-modality condition module based on epipolar lines to adaptively learn cross-modality local correspondence. Besides, X-DRIVE allows for controllable generation through multi-level input conditions, including text, bounding boxes, images, and point clouds. Extensive results demonstrate the high-fidelity synthetic results of X-DRIVE for both point clouds and multi-view images, adhering to input conditions while ensuring reliable cross-modality consistency. Our code will be made publicly available at https://github.com/yichen928/X-Drive
   Submitted 1 November, 2024; originally announced November 2024.

8. arXiv:2410.18572 [pdf, other] (cs.CL, cs.AI, cs.LG)
   Taipan: Efficient and Expressive State Space Language Models with Selective Attention
   Authors: Chien Van Nguyen, Huy Huu Nguyen, Thang M. Pham, Ruiyi Zhang, Hanieh Deilamsalehy, Puneet Mathur, Ryan A. Rossi, Trung Bui, Viet Dac Lai, Franck Dernoncourt, Thien Huu Nguyen
   Abstract: Efficient long-context language modeling remains a significant challenge in Natural Language Processing (NLP). While Transformers dominate language tasks, they struggle with long sequences due to quadratic computational complexity in training and linearly scaling memory costs during inference. Recent State Space Models (SSMs) such as Mamba offer alternatives with constant memory usage, but they underperform in tasks requiring extensive in-context retrieval. We introduce Taipan, a novel hybrid architecture that combines Mamba-2 with Selective Attention Layers (SALs). These SALs identify tokens requiring long-range interactions, remove less important features, and then augment their representations using the attention module. This approach balances Mamba's efficiency with Transformer-like performance in memory-intensive tasks. By constraining the attention budget, Taipan extends accurate predictions to context lengths of up to 1 million tokens while preserving computational efficiency. Our experiments demonstrate Taipan's superior performance across various scales and tasks, offering a promising solution for efficient long-context language modeling.
   Submitted 24 October, 2024; originally announced October 2024.
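The Selective Attention Layer description suggests a gate-then-attend pattern. Here is a minimal sketch of that pattern under my own assumptions (top-k selection by a learned score, with selected tokens attending over the full sequence); the real layer also prunes less important features, which is omitted here.

```python
import torch
import torch.nn as nn

class SelectiveAttentionLayer(nn.Module):
    """Hypothetical sketch: a learned gate picks the tokens that need
    long-range interaction; only those are refined with attention, keeping
    the SSM backbone's per-token cost for everything else."""
    def __init__(self, dim, budget=0.1, heads=4):
        super().__init__()
        self.score = nn.Linear(dim, 1)             # importance score per token
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.budget = budget                       # fraction of tokens attended

    def forward(self, x):                          # x: (batch, seq, dim)
        b, s, d = x.shape
        k = max(1, int(self.budget * s))           # constrained attention budget
        scores = self.score(x).squeeze(-1)         # (batch, seq)
        idx = scores.topk(k, dim=-1).indices       # indices of selected tokens
        sel = torch.gather(x, 1, idx.unsqueeze(-1).expand(b, k, d))
        refined, _ = self.attn(sel, x, x)          # selected tokens attend to all
        return x.scatter(1, idx.unsqueeze(-1).expand(b, k, d), refined)
```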
9. arXiv:2410.17460 [pdf] (eess.SY, cs.LG)
   Graph Neural Network-Accelerated Network-Reconfigured Optimal Power Flow
   Authors: Thuan Pham, Xingpeng Li
   Abstract: Optimal power flow (OPF) has been used for real-time grid operations. Prior efforts demonstrated that utilizing flexibility from dynamic topologies can improve grid efficiency. However, this converts the linear OPF into a mixed-integer linear programming network-reconfigured OPF (NR-OPF) problem, substantially increasing the computing time. Thus, a machine learning (ML)-based approach, particularly one utilizing a graph neural network (GNN), is proposed to accelerate the solution process. The GNN model is trained offline to predict the best topology before entering the optimization stage. In addition, this paper proposes an offline pre-ML filter layer to reduce GNN model size and training time while improving its accuracy. A fast online post-ML selection layer is also proposed to analyze GNN predictions and then select a subset of predicted NR solutions with high confidence. Case studies have demonstrated the superior performance of the proposed GNN-accelerated NR-OPF method augmented with the proposed pre-ML and post-ML layers.
   Submitted 22 October, 2024; originally announced October 2024.
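As a reading aid, here is how the predict-then-optimize flow in the abstract might look end to end. The names `gnn`, `solve_opf`, and the thresholds are placeholders I invented; the paper's actual post-ML selection layer is more involved than this confidence cutoff.

```python
import numpy as np

def shortlist_topologies(gnn, grid_graph, k=3, tau=0.8):
    """Score candidate reconfigurations with the trained GNN, keep the
    top-k, then drop any below the confidence threshold tau."""
    probs = gnn(grid_graph)                 # one score per candidate topology
    top = np.argsort(probs)[::-1][:k]       # top-k candidate topologies
    return [t for t in top if probs[t] >= tau]

def nr_opf(grid_graph, gnn, solve_opf):
    shortlist = shortlist_topologies(gnn, grid_graph)
    if not shortlist:                       # unsure: fall back to the full MILP
        return solve_opf(grid_graph, topology=None)
    # With the topology fixed, each OPF is linear again; keep the cheapest.
    solutions = [solve_opf(grid_graph, topology=t) for t in shortlist]
    return min(solutions, key=lambda s: s["cost"])
```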
10. arXiv:2410.09190 [pdf, other] (cs.LG)
   Time to Retrain? Detecting Concept Drifts in Machine Learning Systems
   Authors: Tri Minh Triet Pham, Karthikeyan Premkumar, Mohamed Naili, Jinqiu Yang
   Abstract: With the boom of machine learning (ML) techniques, software practitioners build ML systems to process the massive volume of streaming data for diverse software engineering tasks such as failure prediction in AIOps. Trained using historical data, such ML models encounter performance degradation caused by concept drift, i.e., data and inter-relationship (concept) changes between training and production. It is essential to use concept drift detection to monitor deployed ML models and re-train them when needed. In this work, we explore applying state-of-the-art (SOTA) concept drift detection techniques on synthetic and real-world datasets in an industrial setting. Such an industrial setting requires minimal manual effort in labeling and maximal generality in ML model architecture. We find that current SOTA semi-supervised methods not only require significant labeling effort but also work only for certain types of ML models. To overcome such limitations, we propose a novel model-agnostic technique (CDSeer) for detecting concept drift. Our evaluation shows that CDSeer has better precision and recall compared to the state of the art while requiring significantly less manual labeling. We demonstrate the effectiveness of CDSeer at concept drift detection by evaluating it on eight datasets from different domains and use cases. Results from an internal deployment of CDSeer on an industrial proprietary dataset show a 57.1% improvement in precision while using 99% fewer labels compared to the SOTA concept drift detection method. The performance is also comparable to that of the supervised concept drift detection method, which requires 100% of the data to be labeled. The improved performance and ease of adoption of CDSeer are valuable in making ML systems more reliable.
   Submitted 11 October, 2024; originally announced October 2024.
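The sketch below is not CDSeer; it illustrates the generic model-agnostic monitoring loop the abstract refers to: watch the deployed model's output distribution and flag drift without any production labels. It assumes scalar predictions; the window size and significance level are arbitrary.

```python
from collections import deque
from scipy.stats import ks_2samp

class DriftMonitor:
    """Compare the model's recent prediction distribution against a
    reference window with a two-sample Kolmogorov-Smirnov test and
    flag drift when the two diverge."""
    def __init__(self, reference, window=500, alpha=0.01):
        self.reference = list(reference)    # predictions from training time
        self.recent = deque(maxlen=window)  # rolling production predictions
        self.alpha = alpha

    def update(self, prediction):
        self.recent.append(prediction)
        if len(self.recent) < self.recent.maxlen:
            return False                    # not enough production data yet
        _, p_value = ks_2samp(self.reference, list(self.recent))
        return p_value < self.alpha         # True => consider retraining
```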
We establish the theoretical convergence guarantees for both methods and demonstrate that the Exact Geodesic Gradient Descent algorithm attains a dimension-free convergence rate. Finally, we conduct experiments to compare the normal Wasserstein Barycenter with ours and perform an ablation study. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08117v1-abstract-full').style.display = 'none'; document.getElementById('2410.08117v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Ngoc-Hai Nguyen and Dung Le contributed equally to this work. 44 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62-08 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> G.1.6; F.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06132">arXiv:2410.06132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.06132">pdf</a>, <a href="https://arxiv.org/ps/2410.06132">ps</a>, <a href="https://arxiv.org/format/2410.06132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> Spread blow-up lemma with an application to perturbed random graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nenadov%2C+R">Rajko Nenadov</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+H+T">Huy Tuan Pham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06132v1-abstract-short" style="display: inline;"> Combining ideas of Pham, Sah, Sawhney, and Simkin on spread perfect matchings in super-regular bipartite graphs with an algorithmic blow-up lemma, we prove a spread version of the blow-up lemma. Intuitively, this means that there exists a probability measure over copies of a desired spanning graph $H$ in a given system of super-regular pairs which does not heavily pin down any subset of vertices.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06132v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06132v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06132v1-abstract-full" style="display: none;"> Combining ideas of Pham, Sah, Sawhney, and Simkin on spread perfect matchings in super-regular bipartite graphs with an algorithmic blow-up lemma, we prove a spread version of the blow-up lemma. 
Intuitively, this means that there exists a probability measure over copies of a desired spanning graph $H$ in a given system of super-regular pairs which does not heavily pin down any subset of vertices. This allows one to complement the use of the blow-up lemma with the recently resolved Kahn-Kalai conjecture. As an application, we prove an approximate version of a conjecture of Böttcher, Parczyk, Sgueglia, and Skokan on the threshold for appearance of powers of Hamilton cycles in perturbed random graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06132v1-abstract-full').style.display = 'none'; document.getElementById('2410.06132v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03292">arXiv:2410.03292</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.03292">pdf</a>, <a href="https://arxiv.org/format/2410.03292">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Demystifying the Token Dynamics of Deep Selective State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Vo%2C+T+N">Thieu N Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+D">Tung D. Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+X+T">Xin T. Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T+M">Tan Minh Nguyen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03292v1-abstract-short" style="display: inline;"> Selective state space models (SSM), such as Mamba, have gained prominence for their effectiveness in modeling sequential data. Despite their outstanding empirical performance, a comprehensive theoretical understanding of deep selective SSM remains elusive, hindering their further development and adoption for applications that need high fidelity. In this paper, we investigate the dynamical properti&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03292v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03292v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03292v1-abstract-full" style="display: none;"> Selective state space models (SSM), such as Mamba, have gained prominence for their effectiveness in modeling sequential data. Despite their outstanding empirical performance, a comprehensive theoretical understanding of deep selective SSM remains elusive, hindering their further development and adoption for applications that need high fidelity. In this paper, we investigate the dynamical properties of tokens in a pre-trained Mamba model. 
In particular, we derive the dynamical system governing the continuous-time limit of the Mamba model and characterize the asymptotic behavior of its solutions. In the one-dimensional case, we prove that only one of the following two scenarios happens: either all tokens converge to zero, or all tokens diverge to infinity. We provide criteria based on model parameters to determine when each scenario occurs. For the convergent scenario, we empirically verify that this scenario negatively impacts the model&#39;s performance. For the divergent scenario, we prove that different tokens will diverge to infinity at different rates, thereby contributing unequally to the updates during model training. Based on these investigations, we propose two refinements for the model: excluding the convergent scenario and reordering tokens based on their importance scores, both aimed at improving practical performance. Our experimental results validate these refinements, offering insights into enhancing Mamba&#39;s effectiveness in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03292v1-abstract-full').style.display = 'none'; document.getElementById('2410.03292v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02130">arXiv:2410.02130</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.02130">pdf</a>, <a href="https://arxiv.org/format/2410.02130">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MDSGen: Fast and Efficient Masked Diffusion Temporal-Aware Transformers for Open-Domain Sound Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+X">Trung X. Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Ton%2C+T">Tri Ton</a>, <a href="/search/cs?searchtype=author&amp;query=Yoo%2C+C+D">Chang D. Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02130v1-abstract-short" style="display: inline;"> We introduce MDSGen, a novel framework for vision-guided open-domain sound generation optimized for model parameter size, memory consumption, and inference speed. 
This framework incorporates two key innovations: (1) a redundant video feature removal module that filters out unnecessary visual information, and (2) a temporal-aware masking strategy that leverages temporal context for enhanced audio g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02130v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02130v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02130v1-abstract-full" style="display: none;"> We introduce MDSGen, a novel framework for vision-guided open-domain sound generation optimized for model parameter size, memory consumption, and inference speed. This framework incorporates two key innovations: (1) a redundant video feature removal module that filters out unnecessary visual information, and (2) a temporal-aware masking strategy that leverages temporal context for enhanced audio generation accuracy. In contrast to existing resource-heavy Unet-based models, MDSGen employs denoising masked diffusion transformers, facilitating efficient generation without reliance on pre-trained diffusion models. Evaluated on the benchmark VGGSound dataset, our smallest model (5M parameters) achieves 97.9% alignment accuracy, using 172x fewer parameters, 371% less memory, and offering 36x faster inference than the current 860M-parameter state-of-the-art model (93.9% accuracy). The larger model (131M parameters) reaches nearly 99% accuracy while requiring 6.5x fewer parameters. These results highlight the scalability and effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02130v1-abstract-full').style.display = 'none'; document.getElementById('2410.02130v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
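<p class="is-size-7">The temporal-aware masking strategy described in this abstract lends itself to a compact illustration. The sketch below is our own reading of the general idea, not the authors' code; the function name, chunk size, and mask ratio are illustrative assumptions.</p> <pre><code># Hypothetical sketch of temporal-aware masking: instead of hiding latent audio
# tokens uniformly at random, contiguous temporal chunks are masked so that
# reconstruction must rely on the surrounding temporal context.
import torch

def temporal_aware_mask(num_tokens, mask_ratio=0.5, chunk_size=8):
    """Return a boolean mask (True = masked) over a 1-D token sequence."""
    mask = torch.zeros(num_tokens, dtype=torch.bool)
    num_chunks = int(num_tokens * mask_ratio) // chunk_size
    for _ in range(num_chunks):
        start = torch.randint(0, max(1, num_tokens - chunk_size), (1,)).item()
        mask[start:start + chunk_size] = True   # hide one contiguous chunk
    return mask

tokens = torch.randn(256, 768)               # e.g., a sequence of latent audio tokens
mask = temporal_aware_mask(tokens.size(0))   # positions the transformer must infer
visible = tokens[~mask]                      # only visible tokens are encoded
</code></pre>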
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14435">arXiv:2409.14435</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14435">pdf</a>, <a href="https://arxiv.org/format/2409.14435">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Compensation for Robotic Joint Failures Using Partially Observable Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Tan-Hanh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Aikins%2C+G">Godwyll Aikins</a>, <a href="/search/cs?searchtype=author&amp;query=Truong%2C+T">Tri Truong</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K">Kim-Doang Nguyen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14435v1-abstract-short" style="display: inline;"> Robotic manipulators are widely used in various industries for complex and repetitive tasks. However, they remain vulnerable to unexpected hardware failures. In this study, we address the challenge of enabling a robotic manipulator to complete tasks despite joint malfunctions. Specifically, we develop a reinforcement learning (RL) framework to adaptively compensate for a non-functional joint durin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14435v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14435v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14435v1-abstract-full" style="display: none;"> Robotic manipulators are widely used in various industries for complex and repetitive tasks. However, they remain vulnerable to unexpected hardware failures. In this study, we address the challenge of enabling a robotic manipulator to complete tasks despite joint malfunctions. Specifically, we develop a reinforcement learning (RL) framework to adaptively compensate for a non-functional joint during task execution. Our experimental platform is the Franka robot with 7 degrees of freedom (DOFs). We formulate the problem as a partially observable Markov decision process (POMDP), where the robot is trained under various joint failure conditions and tested in both seen and unseen scenarios. We consider scenarios where a joint is permanently broken and where it functions intermittently. Additionally, we demonstrate the effectiveness of our approach by comparing it with traditional inverse kinematics-based control methods. The results show that the RL algorithm enables the robot to successfully complete tasks even with joint failures, achieving a high success rate averaging 93.6%. This showcases its robustness and adaptability. Our findings highlight the potential of RL to enhance the resilience and reliability of robotic systems, making them better suited for unpredictable environments. All related code and models are published online. 
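<p class="is-size-7">To make the POMDP formulation in this abstract concrete, here is a minimal sketch (ours, not the paper's published code) of how a joint failure can be simulated: the command sent to the failed joint is dropped inside the environment, while the observation exposes no explicit failure flag, so the policy must infer the fault from interaction history. The wrapper class and its parameters are illustrative assumptions.</p> <pre><code># Minimal joint-failure POMDP wrapper sketch (illustrative, not the paper's code).
import numpy as np

class JointFailureEnv:
    """Wraps a manipulator simulation so one joint is permanently or intermittently dead."""

    def __init__(self, base_env, failed_joint, working_prob=0.0):
        self.base_env = base_env          # any simulated 7-DOF manipulator
        self.failed_joint = failed_joint  # index of the malfunctioning joint
        self.working_prob = working_prob  # 0.0 = permanently broken; higher = intermittent

    def step(self, action):
        action = np.asarray(action, dtype=float).copy()
        if np.random.rand() >= self.working_prob:  # the joint fails on this step
            action[self.failed_joint] = 0.0        # its command is silently dropped
        obs, reward, done, info = self.base_env.step(action)
        return obs, reward, done, info  # obs carries no explicit failure flag
</code></pre>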
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14435v1-abstract-full').style.display = 'none'; document.getElementById('2409.14435v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14074">arXiv:2409.14074</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14074">pdf</a>, <a href="https://arxiv.org/format/2409.14074">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MultiMed: Multilingual Medical Speech Recognition via Attention Encoder Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Le-Duc%2C+K">Khai Le-Duc</a>, <a href="/search/cs?searchtype=author&amp;query=Phan%2C+P">Phuc Phan</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Tan-Hanh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Tat%2C+B+P">Bach Phan Tat</a>, <a href="/search/cs?searchtype=author&amp;query=Ngo%2C+M">Minh-Huong Ngo</a>, <a href="/search/cs?searchtype=author&amp;query=Hy%2C+T">Truong-Son Hy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14074v1-abstract-short" style="display: inline;"> Multilingual automatic speech recognition (ASR) in the medical domain serves as a foundational task for various downstream applications such as speech translation, spoken language understanding, and voice-activated assistants. This technology enhances patient care by enabling efficient communication across language barriers, alleviating specialized workforce shortages, and facilitating improved di&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14074v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14074v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14074v1-abstract-full" style="display: none;"> Multilingual automatic speech recognition (ASR) in the medical domain serves as a foundational task for various downstream applications such as speech translation, spoken language understanding, and voice-activated assistants. This technology enhances patient care by enabling efficient communication across language barriers, alleviating specialized workforce shortages, and facilitating improved diagnosis and treatment, particularly during pandemics. 
In this work, we introduce MultiMed, a collection of small-to-large end-to-end ASR models for the medical domain, spanning five languages: Vietnamese, English, German, French, and Mandarin Chinese, together with the corresponding real-world ASR dataset. To the best of our knowledge, MultiMed is the first and the largest multilingual medical ASR dataset, in terms of total duration, number of speakers, diversity of diseases, recording conditions, speaker roles, unique medical terms, accents, and ICD-10 codes. Secondly, we establish the empirical baselines, present the first reproducible study of multilinguality in medical ASR, conduct a layer-wise ablation study for end-to-end ASR training, and provide the first linguistic analysis for multilingual medical ASR. All code, data, and models are available online at https://github.com/leduckhai/MultiMed/tree/master/MultiMed <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14074v1-abstract-full').style.display = 'none'; document.getElementById('2409.14074v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10824">arXiv:2409.10824</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.10824">pdf</a>, <a href="https://arxiv.org/format/2409.10824">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Evaluating and Improving the Robustness of LiDAR-based Localization and Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+M+T">Tri Minh Triet Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinqiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10824v1-abstract-short" style="display: inline;"> LiDAR is one of the most commonly adopted sensors for simultaneous localization and mapping (SLAM) and map-based global localization. SLAM and map-based localization are crucial for the independent operation of autonomous systems, especially when external signals such as GNSS are unavailable or unreliable. While state-of-the-art (SOTA) LiDAR SLAM systems could achieve 0.5% (i.e., 0.5m per 100m) of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10824v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10824v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10824v1-abstract-full" style="display: none;"> LiDAR is one of the most commonly adopted sensors for simultaneous localization and mapping (SLAM) and map-based global localization. 
SLAM and map-based localization are crucial for the independent operation of autonomous systems, especially when external signals such as GNSS are unavailable or unreliable. While state-of-the-art (SOTA) LiDAR SLAM systems could achieve 0.5% (i.e., 0.5m per 100m) of errors and map-based localization could achieve centimeter-level global localization, it is still unclear how robust they are under various common LiDAR data corruptions. In this work, we extensively evaluated five SOTA LiDAR-based localization systems under 18 common scene-level LiDAR point cloud data (PCD) corruptions. We found that the robustness of LiDAR-based localization varies significantly depending on the category. For SLAM, hand-crafted methods are in general robust against most types of corruption, while being extremely vulnerable (up to +80% errors) to a specific corruption. Learning-based methods are vulnerable to most types of corruptions. For map-based global localization, we found that the SOTA is resistant to all applied corruptions. Finally, we found that simple Bilateral Filter denoising effectively eliminates noise-based corruption but is not helpful in density-based corruption. Re-training is more effective in defending learning-based SLAM against all types of corruption. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10824v1-abstract-full').style.display = 'none'; document.getElementById('2409.10824v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07652">arXiv:2409.07652</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.07652">pdf</a>, <a href="https://arxiv.org/format/2409.07652">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> Gaussian Process Upper Confidence Bounds in Distributed Point Target Tracking over Wireless Sensor Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xingchi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Mihaylova%2C+L">Lyudmila Mihaylova</a>, <a href="/search/cs?searchtype=author&amp;query=George%2C+J">Jemin George</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Tien Pham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07652v1-abstract-short" style="display: inline;"> Uncertainty quantification plays a key role in the development of autonomous systems, decision-making, and tracking over wireless sensor networks (WSNs). However, there is a need to provide uncertainty confidence bounds, especially for distributed machine learning-based tracking, dealing with different volumes of data collected by sensors. 
This paper aims to fill this gap and proposes a distr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07652v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07652v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07652v1-abstract-full" style="display: none;"> Uncertainty quantification plays a key role in the development of autonomous systems, decision-making, and tracking over wireless sensor networks (WSNs). However, there is a need to provide uncertainty confidence bounds, especially for distributed machine learning-based tracking, dealing with different volumes of data collected by sensors. This paper aims to fill this gap and proposes a distributed Gaussian process (DGP) approach for point target tracking and derives upper confidence bounds (UCBs) of the state estimates. A unique contribution of this paper is the derived theoretical guarantees on the proposed approach and its maximum accuracy for tracking with and without clutter measurements. Particularly, the developed approaches with uncertainty bounds are generic and can provide trustworthy solutions with an increased level of reliability. A novel hybrid Bayesian filtering method is proposed to enhance the DGP approach by adopting a Poisson measurement likelihood model. The proposed approaches are validated over a WSN case study, where sensors have limited sensing ranges. Numerical results demonstrate the tracking accuracy and robustness of the proposed approaches. The derived UCBs constitute a tool for trustworthiness evaluation of DGP approaches. The simulation results reveal that the proposed UCBs successfully encompass the true target states with 88% and 42% higher probability in X and Y coordinates, respectively, when compared to the confidence interval-based method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07652v1-abstract-full').style.display = 'none'; document.getElementById('2409.07652v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
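<p class="is-size-7 mathjax">For context, a Gaussian process upper confidence bound on an estimate typically takes the standard GP-UCB form (the paper derives its own bounds, which may differ in the choice of the confidence scaling $\beta_t$): $$\mathrm{UCB}_t(x) = \mu_{t-1}(x) + \sqrt{\beta_t}\,\sigma_{t-1}(x),$$ where $\mu_{t-1}$ and $\sigma_{t-1}$ are the GP posterior mean and standard deviation after $t-1$ measurements, and a larger $\beta_t$ widens the bound to trade accuracy for coverage of the true state.</p>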
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13850">arXiv:2408.13850</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13850">pdf</a>, <a href="https://arxiv.org/format/2408.13850">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Condensed Sample-Guided Model Inversion for Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Binici%2C+K">Kuluhan Binici</a>, <a href="/search/cs?searchtype=author&amp;query=Aggarwal%2C+S">Shivam Aggarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Acar%2C+C">Cihan Acar</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+N+T">Nam Trung Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Leman%2C+K">Karianto Leman</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+G+H">Gim Hee Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Mitra%2C+T">Tulika Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13850v1-abstract-short" style="display: inline;"> Knowledge distillation (KD) is a key element in neural network compression that allows knowledge transfer from a pre-trained teacher model to a more compact student model. KD relies on access to the training dataset, which may not always be fully available due to privacy concerns or logistical issues related to the size of the data. To address this, &#34;data-free&#34; KD methods use synthetic data, gener&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13850v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13850v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13850v1-abstract-full" style="display: none;"> Knowledge distillation (KD) is a key element in neural network compression that allows knowledge transfer from a pre-trained teacher model to a more compact student model. KD relies on access to the training dataset, which may not always be fully available due to privacy concerns or logistical issues related to the size of the data. To address this, &#34;data-free&#34; KD methods use synthetic data, generated through model inversion, to mimic the target data distribution. However, conventional model inversion methods are not designed to utilize supplementary information from the target dataset, and thus, cannot leverage it to improve performance, even when it is available. In this paper, we consider condensed samples, as a form of supplementary information, and introduce a method for using them to better approximate the target data distribution, thereby enhancing the KD performance. Our approach is versatile, evidenced by improvements of up to 11.4% in KD accuracy across various datasets and model inversion-based methods. Importantly, it remains effective even when using as few as one condensed sample per class, and can also enhance performance in few-shot scenarios where only limited real data samples are available. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13850v1-abstract-full').style.display = 'none'; document.getElementById('2408.13850v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13686">arXiv:2408.13686</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13686">pdf</a>, <a href="https://arxiv.org/format/2408.13686">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Perception-Guided Fuzzing for Simulated Scenario-Based Testing of Autonomous Driving Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+M+T">Tri Minh Triet Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinqiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13686v1-abstract-short" style="display: inline;"> Autonomous Driving Systems (ADS) have made huge progress and started on-road testing or even commercializing trials. ADS are complex and difficult to test: they receive input data from multiple sensors and make decisions using a combination of multiple deep neural network models and code logic. The safety of ADS is of utmost importance as their misbehavior can result in costly catastrophes, includ&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13686v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13686v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13686v1-abstract-full" style="display: none;"> Autonomous Driving Systems (ADS) have made huge progress and started on-road testing or even commercializing trials. ADS are complex and difficult to test: they receive input data from multiple sensors and make decisions using a combination of multiple deep neural network models and code logic. The safety of ADS is of utmost importance as their misbehavior can result in costly catastrophes, including the loss of human life. In this work, we propose SimsV, which performs system-level testing on multi-module ADS. SimsV targets perception failures of ADS and further assesses the impact of perception failure on the system as a whole. SimsV leverages a high-fidelity simulator for test input and oracle generation by continuously applying predefined mutation operators. In addition, SimsV leverages various metrics to guide the testing process. We implemented a prototype SimsV for testing a commercial-grade Level 4 ADS (i.e., Apollo) using a popular open-source driving platform simulator. Our evaluation shows that SimsV is capable of finding weaknesses in the perception of Apollo. Furthermore, we show that by exploiting such weakness, SimsV finds severe problems in Apollo, including collisions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13686v1-abstract-full').style.display = 'none'; document.getElementById('2408.13686v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13653">arXiv:2408.13653</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13653">pdf</a>, <a href="https://arxiv.org/format/2408.13653">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Robustness of LiDAR-based 3D Obstacles Detection and Its Impacts on Autonomous Driving Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+M+T">Tri Minh Triet Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jinqiu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13653v1-abstract-short" style="display: inline;"> Autonomous driving systems (ADSs) require real-time input from multiple sensors to make time-sensitive decisions using deep neural networks. This makes the correctness of these decisions crucial to ADSs&#39; adoption as errors can cause significant loss. Sensors such as LiDAR are sensitive to environmental changes and built-in inaccuracies and may fluctuate between frames. While there has been extensi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13653v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13653v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13653v1-abstract-full" style="display: none;"> Autonomous driving systems (ADSs) require real-time input from multiple sensors to make time-sensitive decisions using deep neural networks. This makes the correctness of these decisions crucial to ADSs&#39; adoption as errors can cause significant loss. Sensors such as LiDAR are sensitive to environmental changes and built-in inaccuracies and may fluctuate between frames. While there has been extensive work to test ADSs, it remains unclear whether current ADSs are robust against very subtle changes in LiDAR point cloud data. In this work, we study the impact of the built-in inaccuracies in LiDAR sensors on LiDAR-3D obstacle detection models to provide insight into how they can impact obstacle detection (i.e., robustness) and by extension trajectory prediction (i.e., how the robustness of obstacle detection would impact ADSs). We propose a framework SORBET, that applies subtle perturbations to LiDAR data, evaluates the robustness of LiDAR-3D obstacle detection, and assesses the impacts on the trajectory prediction module and ADSs. 
We applied SORBET to evaluate the robustness of five classic LiDAR-3D obstacle detection models, including one from an industry-grade Level 4 ADS (Baidu&#39;s Apollo). Furthermore, we studied how changes in the obstacle detection results would negatively impact trajectory prediction in a cascading fashion. Our evaluation highlights the importance of testing the robustness of LiDAR-3D obstacle detection models against subtle perturbations. We find that even very subtle changes in point cloud data (e.g., removing two points) may introduce a non-trivial decrease in the detection performance. Furthermore, such a negative impact will further propagate to other modules, and endanger the safety of ADSs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13653v1-abstract-full').style.display = 'none'; document.getElementById('2408.13653v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12480">arXiv:2408.12480</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.12480">pdf</a>, <a href="https://arxiv.org/format/2408.12480">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Doan%2C+K+T">Khang T. Doan</a>, <a href="/search/cs?searchtype=author&amp;query=Huynh%2C+B+G">Bao G. Huynh</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+D+T">Dung T. Hoang</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T+D">Thuc D. Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+N+H">Nhat H. Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+Q+T+M">Quan T. M. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Vo%2C+B+Q">Bang Q. Vo</a>, <a href="/search/cs?searchtype=author&amp;query=Hoang%2C+S+N">Suong N. Hoang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12480v2-abstract-short" style="display: inline;"> In this report, we introduce Vintern-1B, a reliable 1-billion-parameter multimodal large language model (MLLM) for Vietnamese language tasks. 
By integrating the Qwen2-0.5B-Instruct language model with the InternViT-300M-448px visual model, Vintern-1B is optimized for a range of applications, including optical character recognition (OCR), document extraction, and general question-answering in Viet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12480v2-abstract-full').style.display = 'inline'; document.getElementById('2408.12480v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12480v2-abstract-full" style="display: none;"> In this report, we introduce Vintern-1B, a reliable 1-billion-parameter multimodal large language model (MLLM) for Vietnamese language tasks. By integrating the Qwen2-0.5B-Instruct language model with the InternViT-300M-448px visual model, Vintern-1B is optimized for a range of applications, including optical character recognition (OCR), document extraction, and general question-answering in the Vietnamese context. The model is fine-tuned on an extensive dataset of over 3 million image-question-answer pairs, achieving robust performance and reliable results across multiple Vietnamese language benchmarks like OpenViVQA and ViTextVQA. Vintern-1B is small enough to fit into various on-device applications easily. Additionally, we have open-sourced several Vietnamese vision question answering (VQA) datasets for text and diagrams, created with Gemini 1.5 Flash. Our models are available at: https://huggingface.co/5CD-AI/Vintern-1B-v2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12480v2-abstract-full').style.display = 'none'; document.getElementById('2408.12480v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
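<p class="is-size-7">Since the checkpoint above is published on Hugging Face, loading it presumably follows the standard transformers remote-code convention; the sketch below covers only that loading step and is an assumption on our part, not documentation of the repository's inference API.</p> <pre><code># Loading sketch, assuming the checkpoint follows the standard Hugging Face
# remote-code convention; consult the model card for the actual inference API.
import torch
from transformers import AutoModel, AutoTokenizer

model_id = "5CD-AI/Vintern-1B-v2"  # repository cited in the abstract
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # a 1B-parameter model fits on a single GPU
    trust_remote_code=True,
).eval()
</code></pre>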
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08651">arXiv:2408.08651</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.08651">pdf</a>, <a href="https://arxiv.org/format/2408.08651">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of Thought Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Moore%2C+K">Kyle Moore</a>, <a href="/search/cs?searchtype=author&amp;query=Roberts%2C+J">Jesse Roberts</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Thao Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Fisher%2C+D">Douglas Fisher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08651v2-abstract-short" style="display: inline;"> Language models are known to absorb biases from their training data, leading to predictions driven by statistical regularities rather than semantic relevance. We investigate the impact of these biases on answer choice preferences in the Massive Multi-Task Language Understanding (MMLU) task. Our findings reveal that differences in learned regularities across answer options are predictive of model p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08651v2-abstract-full').style.display = 'inline'; document.getElementById('2408.08651v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08651v2-abstract-full" style="display: none;"> Language models are known to absorb biases from their training data, leading to predictions driven by statistical regularities rather than semantic relevance. We investigate the impact of these biases on answer choice preferences in the Massive Multi-Task Language Understanding (MMLU) task. Our findings reveal that differences in learned regularities across answer options are predictive of model preferences and mirror human test-taking strategies. To address this issue, we introduce two novel methods: Counterfactual Prompting with Chain of Thought (CoT) and Counterfactual Prompting with Agnostically Primed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with CoT alone is insufficient to mitigate bias, our novel Primed Counterfactual Prompting with CoT approach effectively reduces the influence of base-rate probabilities while improving overall accuracy. Our results suggest that mitigating bias requires a &#34;System-2&#34; like process and that CoT reasoning is susceptible to confirmation bias under some prompting methodologies. Our contributions offer practical solutions for developing more robust and fair language models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08651v2-abstract-full').style.display = 'none'; document.getElementById('2408.08651v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06223">arXiv:2408.06223</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06223">pdf</a>, <a href="https://arxiv.org/format/2408.06223">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On Effects of Steering Latent Representation for Large Language Model Unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huu-Tien%2C+D">Dang Huu-Tien</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Trung-Tin Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Thanh-Tung%2C+H">Hoang Thanh-Tung</a>, <a href="/search/cs?searchtype=author&amp;query=Inoue%2C+N">Naoya Inoue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06223v1-abstract-short" style="display: inline;"> Representation Misdirection for Unlearning (RMU), which steers model representation in the intermediate layer to a target random representation, is an effective method for large language model (LLM) unlearning. Despite its high performance, the underlying cause and explanation remain underexplored. In this paper, we first theoretically demonstrate that steering forget representations in the interm&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06223v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06223v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06223v1-abstract-full" style="display: none;"> Representation Misdirection for Unlearning (RMU), which steers model representation in the intermediate layer to a target random representation, is an effective method for large language model (LLM) unlearning. Despite its high performance, the underlying cause and explanation remain underexplored. In this paper, we first theoretically demonstrate that steering forget representations in the intermediate layer reduces token confidence, causing LLMs to generate wrong or nonsense responses. Second, we investigate how the coefficient influences the alignment of forget-sample representations with the random direction and hint at the optimal coefficient values for effective unlearning across different network layers. Third, we show that RMU unlearned models are robust against adversarial jailbreak attacks. Last, our empirical analysis shows that RMU is less effective when applied to the middle and later layers in LLMs. 
To resolve this drawback, we propose Adaptive RMU -- a simple yet effective alternative method that makes unlearning effective with most layers. Extensive experiments demonstrate that Adaptive RMU significantly improves the unlearning performance compared to prior art while incurring no additional computational cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06223v1-abstract-full').style.display = 'none'; document.getElementById('2408.06223v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 5 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04174">arXiv:2408.04174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04174">pdf</a>, <a href="https://arxiv.org/format/2408.04174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> wav2graph: A Framework for Supervised Learning Knowledge Graph from Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Le-Duc%2C+K">Khai Le-Duc</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+Q">Quy-Anh Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+T">Tan-Hanh Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Hy%2C+T">Truong-Son Hy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04174v1-abstract-short" style="display: inline;"> Knowledge graphs (KGs) enhance the performance of large language models (LLMs) and search engines by providing structured, interconnected data that improves reasoning and context-awareness. However, KGs only focus on text data, thereby neglecting other modalities such as speech. In this work, we introduce wav2graph, the first framework for supervised learning knowledge graph from speech data. 
Our&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04174v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04174v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04174v1-abstract-full" style="display: none;"> Knowledge graphs (KGs) enhance the performance of large language models (LLMs) and search engines by providing structured, interconnected data that improves reasoning and context-awareness. However, KGs only focus on text data, thereby neglecting other modalities such as speech. In this work, we introduce wav2graph, the first framework for supervised learning knowledge graph from speech data. Our pipeline is straightforward: (1) constructing a KG based on transcribed spoken utterances and a named entity database, (2) converting the KG into embedding vectors, and (3) training graph neural networks (GNNs) for node classification and link prediction tasks. Through extensive experiments conducted in inductive and transductive learning contexts using state-of-the-art GNN models, we provide baseline results and error analysis for node classification and link prediction tasks on human transcripts and automatic speech recognition (ASR) transcripts, including evaluations using both encoder-based and decoder-based node embeddings, as well as monolingual and multilingual acoustic pre-trained models. All related code, data, and models are published online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04174v1-abstract-full').style.display = 'none'; document.getElementById('2408.04174v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
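<p class="is-size-7">Stage (3) of the pipeline in this abstract, training a GNN on the speech-derived knowledge graph, can be sketched generically. The paper benchmarks several state-of-the-art GNNs, so the two-layer GCN below (assuming PyTorch Geometric) is only a stand-in for the node-classification task.</p> <pre><code># Generic node-classification sketch for stage (3), assuming PyTorch Geometric;
# the actual models evaluated in the paper may differ.
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class NodeClassifier(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        # x: node embeddings from stage (2); edge_index: KG edges from stage (1)
        h = F.relu(self.conv1(x, edge_index))
        return self.conv2(h, edge_index)  # per-node class logits
</code></pre>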
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint, 32 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04165">arXiv:2408.04165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04165">pdf</a>, <a href="https://arxiv.org/format/2408.04165">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> Sunflowers in set systems with small VC-dimension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Balogh%2C+J">József Balogh</a>, <a href="/search/cs?searchtype=author&amp;query=Bernshteyn%2C+A">Anton Bernshteyn</a>, <a href="/search/cs?searchtype=author&amp;query=Delcourt%2C+M">Michelle Delcourt</a>, <a href="/search/cs?searchtype=author&amp;query=Ferber%2C+A">Asaf Ferber</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+H+T">Huy Tuan Pham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04165v1-abstract-short" style="display: inline;"> A family of $r$ distinct sets $\{A_1,\ldots, A_r\}$ is an $r$-sunflower if for all $1 \leqslant i &lt; j \leqslant r$ and $1 \leqslant i&#39; &lt; j&#39; \leqslant r$, we have $A_i \cap A_j = A_{i&#39;} \cap A_{j&#39;}$. Erdős and Rado conjectured in 1960 that every family $\mathcal{H}$ of $\ell$-element sets of size at least $K(r)^\ell$ contains an $r$-sunflower, where $K(r)$ is some function that depends only on $r$.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04165v1-abstract-full').style.display = 'inline'; document.getElementById('2408.04165v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04165v1-abstract-full" style="display: none;"> A family of $r$ distinct sets $\{A_1,\ldots, A_r\}$ is an $r$-sunflower if for all $1 \leqslant i &lt; j \leqslant r$ and $1 \leqslant i&#39; &lt; j&#39; \leqslant r$, we have $A_i \cap A_j = A_{i&#39;} \cap A_{j&#39;}$. Erdős and Rado conjectured in 1960 that every family $\mathcal{H}$ of $\ell$-element sets of size at least $K(r)^\ell$ contains an $r$-sunflower, where $K(r)$ is some function that depends only on $r$. We prove that if $\mathcal{H}$ is a family of $\ell$-element sets of VC-dimension at most $d$ and $|\mathcal{H}| &gt; (C r (\log d+\log^\ast \ell))^\ell$ for some absolute constant $C &gt; 0$, then $\mathcal{H}$ contains an $r$-sunflower. This improves a recent result of Fox, Pach, and Suk. When $d=1$, we obtain a sharp bound, namely that $|\mathcal{H}| &gt; (r-1)^\ell$ is sufficient. Along the way, we establish a strengthening of the Kahn-Kalai conjecture for set families of bounded VC-dimension, which is of independent interest. 
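<p class="is-size-7 mathjax">Both this abstract and the spread blow-up lemma abstract above rely on the notion of a spread measure, which is standard and worth recalling (normalizations vary between papers): a probability measure $\mu$ on subsets of a ground set $X$ is $q$-spread if $$\mu\bigl(\{A : S \subseteq A\}\bigr) \leqslant q^{|S|} \quad \text{for every } S \subseteq X.$$ The resolved Kahn-Kalai conjecture turns such spread bounds into threshold statements for the appearance of the corresponding structures in random sets.</p>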
arXiv:2408.03637 [pdf, other]
Title: TALE: Training-free Cross-domain Image Composition via Adaptive Latent Manipulation and Energy-guided Optimization
Authors: Kien T. Pham, Jingye Chen, Qifeng Chen
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
DOI: 10.1145/3664647.3681079
Abstract: We present TALE, a novel training-free framework harnessing the generative capabilities of text-to-image diffusion models to address the cross-domain image composition task, which focuses on flawlessly incorporating user-specified objects into a designated visual context regardless of domain disparity. Previous methods often involve either training auxiliary networks or finetuning diffusion models on customized datasets, which are expensive and may undermine the robust textual and visual priors of pre-trained diffusion models.
Some recent works attempt to break the barrier by proposing training-free workarounds that rely on manipulating attention maps to tame the denoising process implicitly. However, composing via attention maps does not necessarily yield the desired compositional outcomes: these approaches can retain only some semantic information, and usually fall short in preserving the identity characteristics of input objects or exhibit limited background-object style adaptation in generated images. In contrast, TALE operates directly on latent space to provide explicit and effective guidance for the composition process and resolve these problems. Specifically, we equip TALE with two mechanisms, dubbed Adaptive Latent Manipulation and Energy-guided Latent Optimization. The former formulates noisy latents conducive to initiating and steering the composition process by directly leveraging background and foreground latents at corresponding timesteps, and the latter exploits designated energy functions to further optimize intermediate latents conforming to specific conditions that complement the former to generate the desired final results. Our experiments demonstrate that TALE surpasses prior baselines and attains state-of-the-art performance in image-guided composition across various photorealistic and artistic domains.
Submitted 7 August, 2024; originally announced August 2024.
Comments: The 32nd ACM Multimedia Conference (MM '24)
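A rough sketch of the latent-blending idea behind the first mechanism, under our own assumptions about tensor shapes, the mask convention, and a DDPM-style noise schedule; this is not the authors' implementation:

```python
# Hypothetical sketch: blend background and foreground latents with a mask,
# then re-noise the blend to timestep t so denoising can start from it.
import torch

def init_composite_latent(z_bg, z_fg, mask, t, alphas_cumprod):
    # z_bg, z_fg: (1, C, H, W) latents; mask: (1, 1, H, W) in {0, 1},
    # 1 where the user-specified object should appear (assumed convention)
    z_mix = mask * z_fg + (1 - mask) * z_bg
    a_bar = alphas_cumprod[t]                 # DDPM cumulative alpha at step t
    noise = torch.randn_like(z_mix)
    return a_bar.sqrt() * z_mix + (1 - a_bar).sqrt() * noise
```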
arXiv:2408.00892 [pdf, other]
Title: Peptide Sequencing Via Protein Language Models
Authors: Thuong Le Hoai Pham, Jillur Rahman Saurav, Aisosa A. Omere, Calvin J. Heyl, Mohammad Sadegh Nasr, Cody Tyler Reynolds, Jai Prakash Yadav Veerla, Helen H Shang, Justyn Jaworski, Alison Ravenscraft, Joseph Anthony Buonomo, Jacob M. Luber
Subjects: q-bio.BM (Biomolecules); cs.LG (Machine Learning)
Abstract: We introduce a protein language model for determining the complete sequence of a peptide based on measurement of a limited set of amino acids. To date, protein sequencing relies on mass spectrometry, with some novel Edman degradation based platforms able to sequence non-native peptides. Current protein sequencing techniques face limitations in accurately identifying all amino acids, hindering comprehensive proteome analysis. Our method simulates partial sequencing data by selectively masking amino acids that are experimentally difficult to identify in protein sequences from the UniRef database. This targeted masking mimics real-world sequencing limitations. We then modify and finetune a ProtBert-derived transformer-based model for a new downstream task: predicting these masked residues, providing an approximation of the complete sequence. Evaluating on three bacterial Escherichia species, we achieve per-amino-acid accuracy of up to 90.5% when only four amino acids ([KCYM]) are known. Structural assessment using AlphaFold and TM-score validates the biological relevance of our predictions. The model also demonstrates potential for evolutionary analysis through cross-species performance. This integration of simulated experimental constraints with computational predictions offers a promising avenue for enhancing protein sequence analysis, potentially accelerating advancements in proteomics and structural biology by providing a probabilistic reconstruction of the complete protein sequence from limited experimental data.
Submitted 1 August, 2024; originally announced August 2024.
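The masking step described here is simple to sketch. Assuming, per the abstract, that only the residues K, C, Y, M are experimentally observable (everything else is hidden for the model to reconstruct); the [MASK] token is illustrative:

```python
# Hide every residue the assay cannot read; a ProtBert-style masked LM is
# then finetuned to predict the hidden positions.
OBSERVABLE = set("KCYM")

def mask_sequence(seq: str, mask_token: str = "[MASK]") -> list[str]:
    return [aa if aa in OBSERVABLE else mask_token for aa in seq]

print(mask_sequence("MKTAYIAKQRC"))
# ['M', 'K', '[MASK]', '[MASK]', 'Y', '[MASK]', '[MASK]', 'K', '[MASK]', '[MASK]', 'C']
```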
arXiv:2407.12064 [pdf, other]
Title: LiteGPT: Large Vision-Language Model for Joint Chest X-ray Localization and Classification Task
Authors: Khai Le-Duc, Ryan Zhang, Ngoc Son Nguyen, Tan-Hanh Pham, Anh Dao, Ba Hung Ngo, Anh Totti Nguyen, Truong-Son Hy
Subjects: eess.IV (Image and Video Processing); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.MM (Multimedia)
Abstract: Vision-language models have been extensively explored across a wide range of tasks, achieving satisfactory performance; however, their application in medical imaging remains underexplored. In this work, we propose a unified framework, LiteGPT, for medical imaging.
We leverage multiple pre-trained visual encoders to enrich information and enhance the performance of vision-language models. To the best of our knowledge, this is the first study to utilize vision-language models for the novel task of joint localization and classification in medical images. Moreover, we are the first to provide baselines for disease localization in chest X-rays. Finally, we set new state-of-the-art performance in the image classification task on the well-benchmarked VinDr-CXR dataset. All code and models are publicly available online: https://github.com/leduckhai/LiteGPT
Submitted 15 July, 2024; originally announced July 2024.
Comments: Preprint, 19 pages

arXiv:2407.06349 [pdf, other]
Title: Large Language Model Recall Uncertainty is Modulated by the Fan Effect
Authors: Jesse Roberts, Kyle Moore, Thao Pham, Oseremhen Ewaleifoh, Doug Fisher
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Abstract: This paper evaluates whether large language models (LLMs) exhibit cognitive fan effects, similar to those discovered by Anderson in humans, after being pre-trained on human textual data.
We conduct two sets of in-context recall experiments designed to elicit fan effects. Consistent with human results, we find that LLM recall uncertainty, measured via token probability, is influenced by the fan effect. Our results show that removing uncertainty disrupts the observed effect. The experiments suggest the fan effect is consistent whether the fan value is induced in-context or in the pre-training data. Finally, these findings provide in-silico evidence that fan effects and typicality are expressions of the same phenomena.
Submitted 29 September, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
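Recall uncertainty measured "via token probability" can be sketched as follows; the model choice and the fan-style prompt are illustrative stand-ins, not the paper's materials:

```python
# Read off the probability of a recalled completion from a causal LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")          # illustrative model choice
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "The doctor is in the park. The doctor is in the"
ids = tok(prompt, return_tensors="pt").input_ids
with torch.no_grad():
    logits = model(ids).logits[0, -1]                # next-token logits
probs = torch.softmax(logits, dim=-1)
p = probs[tok.encode(" park")[0]].item()             # P(" park" | prompt)
print(f"recall probability: {p:.4f}")
```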
arXiv:2406.17272 [pdf, ps, other]
Title: A Comprehensive Solution to Connect Speech Encoder and Large Language Model for ASR
Authors: Van Tung Pham, Yist Lin, Tao Han, Wei Li, Jun Zhang, Lu Lu, Yuxuan Wang
Subjects: cs.LG (Machine Learning)
Abstract: Recent works have shown promising results in connecting speech encoders to large language models (LLMs) for speech recognition. However, several limitations persist, including limited fine-tuning options, a lack of mechanisms to enforce speech-text alignment, and high insertion errors, especially in domain-mismatch conditions. This paper presents a comprehensive solution to address these issues. We begin by investigating more thoughtful fine-tuning schemes. Next, we propose a matching loss to enhance alignment between modalities. Finally, we explore training and inference methods to mitigate high insertion errors. Experimental results on the Librispeech corpus demonstrate that partially fine-tuning the encoder and LLM using parameter-efficient methods, such as LoRA, is the most cost-effective approach. Additionally, the matching loss improves modality alignment, enhancing performance. The proposed training and inference methods significantly reduce insertion errors.
Submitted 25 June, 2024; originally announced June 2024.
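The matching loss is described only at a high level; one plausible toy formulation (an assumption of ours, not the paper's exact loss), given speech states already projected into the LLM's embedding space, pulls pooled speech and transcript representations together:

```python
# Assumed toy modality-matching loss: align time-pooled speech-encoder
# states with the LLM's embeddings of the reference transcript.
import torch
import torch.nn.functional as F

def matching_loss(speech_states, text_embeds):
    # speech_states: (B, T_speech, D); text_embeds: (B, T_text, D)
    s = speech_states.mean(dim=1)                 # crude pooling over time
    t = text_embeds.mean(dim=1)
    return 1.0 - F.cosine_similarity(s, t, dim=-1).mean()

loss = matching_loss(torch.randn(2, 50, 512), torch.randn(2, 12, 512))
```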
arXiv:2406.13725 [pdf, other]
Title: Tree-Sliced Wasserstein Distance on a System of Lines
Authors: Viet-Hoang Tran, Trang Pham, Tho Tran, Tam Le, Tan M. Nguyen
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); stat.ML (Machine Learning)
Abstract: The Sliced Wasserstein (SW) distance in Optimal Transport (OT) is widely used in various applications thanks to its statistical effectiveness and computational efficiency. On the other hand, Tree Wasserstein (TW) and Tree-Sliced Wasserstein (TSW) are instances of OT for probability measures whose ground cost is a tree metric. TSW also has low computational complexity, i.e., linear in the number of edges in the tree. In particular, TSW is identical to SW when the tree is a chain. While SW is prone to loss of topological information of input measures due to relying on one-dimensional projections, TSW is more flexible and has a higher degree of freedom by choosing a tree rather than a line, alleviating the curse of dimensionality in SW. However, for practical applications, popular tree-metric sampling methods are heavily built upon given supports, which limits their capacity to adapt to new supports. In this paper, we propose the Tree-Sliced Wasserstein distance on a System of Lines (TSW-SL), which establishes a connection between SW and TSW. Compared to SW and TSW, our TSW-SL benefits from the higher degree of freedom of TSW while being as suitable for dynamic settings as SW. In TSW-SL, we use a variant of the Radon transform to project measures onto a system of lines, resulting in measures on a space with a tree metric, and then leverage TW to efficiently compute distances between them. We empirically verify the advantages of TSW-SL over traditional SW by conducting a variety of experiments on gradient flows, image style transfer, and generative models.
Submitted 19 June, 2024; originally announced June 2024.
Comments: 33 pages, 6 figures, 2 tables, 4 algorithms
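For orientation, the ordinary sliced Wasserstein distance that TSW-SL generalizes can be computed by projecting onto random lines and solving 1D optimal transport by sorting; TSW-SL replaces the single line with a system of lines carrying a tree metric. A minimal sketch of plain SW:

```python
# Monte Carlo estimate of the (squared) sliced 2-Wasserstein distance
# between two empirical measures with the same number of samples.
import torch

def sliced_wasserstein_sq(X, Y, n_proj=128):
    # X, Y: (n, d) samples from the two measures
    theta = torch.randn(n_proj, X.shape[1])
    theta = theta / theta.norm(dim=1, keepdim=True)    # random directions
    Xp, _ = torch.sort(X @ theta.T, dim=0)             # sorted 1D projections
    Yp, _ = torch.sort(Y @ theta.T, dim=0)
    return ((Xp - Yp) ** 2).mean()                     # average 1D W2^2

d = sliced_wasserstein_sq(torch.randn(256, 3), torch.randn(256, 3) + 1.0)
```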
arXiv:2406.11634 [pdf, other]
Title: The Base-Rate Effect on LLM Benchmark Performance: Disambiguating Test-Taking Strategies from Benchmark Performance
Authors: Kyle Moore, Jesse Roberts, Thao Pham, Oseremhen Ewaleifoh, Doug Fisher
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Abstract: Cloze testing is a common method for measuring the behavior of large language models on a number of benchmark tasks. Using the MMLU dataset, we show that the base-rate probability (BRP) differences across answer tokens are significant and affect task performance, i.e., guess A if uncertain. We find that counterfactual prompting does sufficiently mitigate the BRP effect. The BRP effect is found to have a similar effect to test-taking strategies employed by humans, leading to the conflation of task performance and test-taking ability. We propose the Nvr-X-MMLU task, a variation of MMLU, which helps to disambiguate test-taking ability from task performance and reports the latter.
Submitted 30 September, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
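The base-rate probability of answer tokens can be probed with a content-free prompt; the prompt and model below are illustrative, while the paper works with MMLU-style items:

```python
# Measure per-letter probabilities with no question content: unequal
# values reveal the base-rate skew the paper analyzes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")          # illustrative model choice
model = AutoModelForCausalLM.from_pretrained("gpt2")

ids = tok("Answer:", return_tensors="pt").input_ids
with torch.no_grad():
    probs = torch.softmax(model(ids).logits[0, -1], dim=-1)
for letter in "ABCD":
    tid = tok.encode(" " + letter)[0]
    print(letter, round(probs[tid].item(), 5))
```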
arXiv:2406.09717 [pdf, other]
Title: UniBridge: A Unified Approach to Cross-Lingual Transfer Learning for Low-Resource Languages
Authors: Trinh Pham, Khoi M. Le, Luu Anh Tuan
Subjects: cs.CL (Computation and Language)
Abstract: In this paper, we introduce UniBridge (Cross-Lingual Transfer Learning with Optimized Embeddings and Vocabulary), a comprehensive approach developed to improve the effectiveness of cross-lingual transfer learning, particularly in languages with limited resources. Our approach tackles two essential elements of a language model: the initialization of embeddings and the optimal vocabulary size. Specifically, we propose a novel embedding initialization method that leverages both lexical and semantic alignment for a language. In addition, we present a method for systematically searching for the optimal vocabulary size, ensuring a balance between model complexity and linguistic coverage. Our experiments across multilingual datasets show that our approach greatly improves the F1-score in several languages. UniBridge is a robust and adaptable solution for cross-lingual systems in various languages, highlighting the significance of initializing embeddings and choosing the right vocabulary size in cross-lingual environments.
Submitted 20 August, 2024; v1 submitted 14 June, 2024; originally announced June 2024.
Comments: First two authors contribute equally. Accepted at ACL 2024
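One way to picture the embedding-initialization idea (our own simplification, not the paper's exact method): a new target-language token starts from a similarity-weighted average of the source embeddings it aligns with; the alignment scores are assumed given.

```python
# Initialize one new token embedding from aligned source embeddings.
import torch

def init_new_embedding(src_embeds, align_scores):
    # src_embeds: (V_src, D) source embedding matrix;
    # align_scores: (V_src,) lexical/semantic alignment to the new token
    w = torch.softmax(align_scores, dim=0)        # alignment -> weights
    return w @ src_embeds                         # (D,) initial embedding

e_new = init_new_embedding(torch.randn(30000, 768), torch.randn(30000))
```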
arXiv:2406.08953 [pdf, other]
Title: Preserving Identity with Variational Score for General-purpose 3D Editing
Authors: Duong H. Le, Tuan Pham, Aniruddha Kembhavi, Stephan Mandt, Wei-Chiu Ma, Jiasen Lu
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Abstract: We present Piva (Preserving Identity with Variational Score Distillation), a novel optimization-based method for editing images and 3D models based on diffusion models. Specifically, our approach is inspired by the recently proposed method for 2D image editing, Delta Denoising Score (DDS). We pinpoint the limitations in DDS for 2D and 3D editing, which cause detail loss and over-saturation. To address this, we propose an additional score distillation term that enforces identity preservation.
This results in a more stable editing process, gradually optimizing NeRF models to match target prompts while retaining crucial input characteristics. We demonstrate the effectiveness of our approach in zero-shot image and neural field editing. Our method successfully alters visual attributes, adds both subtle and substantial structural elements, translates shapes, and achieves competitive results on standard 2D and 3D editing benchmarks. Additionally, our method imposes no constraints like masking or pre-training, making it compatible with a wide range of pre-trained diffusion models. This allows for versatile editing without needing neural field-to-mesh conversion, offering a more user-friendly experience.
Submitted 13 June, 2024; originally announced June 2024.
Comments: 22 pages, 14 figures

arXiv:2406.08943 [pdf, other]
Title: Neural NeRF Compression
Authors: Tuan Pham, Stephan Mandt
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Abstract: Neural Radiance Fields (NeRFs) have emerged as powerful tools for capturing detailed 3D scenes through continuous volumetric representations. Recent NeRFs utilize feature grids to improve rendering quality and speed; however, these representations introduce significant storage overhead.
This paper presents a novel method for efficiently compressing a grid-based NeRF model, addressing the storage-overhead concern. Our approach is based on the non-linear transform coding paradigm, employing neural compression to compress the model's feature grids. Due to the lack of training data involving many i.i.d. scenes, we design an encoder-free, end-to-end optimized approach for individual scenes, using lightweight decoders. To leverage the spatial inhomogeneity of the latent feature grids, we introduce an importance-weighted rate-distortion objective and a sparse entropy model employing a masking mechanism. Our experimental results validate that our proposed method surpasses existing works in terms of grid-based NeRF compression efficacy and reconstruction quality.
Submitted 13 June, 2024; originally announced June 2024.
Comments: Accepted to ICML 2024
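The importance-weighted rate-distortion objective can be sketched abstractly. The symbols below (per-ray importance weights, a bits estimate from the entropy model, the trade-off weight lam) are assumptions for illustration, not the paper's exact formulation:

```python
# Rate-distortion trade-off with per-ray importance weights:
# loss = weighted distortion + lam * estimated rate.
import torch

def rd_loss(pred_rgb, gt_rgb, importance, est_bits, lam=1e-3):
    # pred_rgb, gt_rgb: (N, 3) rendered vs. ground-truth colors;
    # importance: (N, 1) weights; est_bits: code-length estimates
    distortion = (importance * (pred_rgb - gt_rgb) ** 2).mean()
    return distortion + lam * est_bits.mean()
```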
arXiv:2406.03431 [pdf, other]
Title: CattleFace-RGBT: RGB-T Cattle Facial Landmark Benchmark
Authors: Ethan Coffman, Reagan Clark, Nhat-Tan Bui, Trong Thang Pham, Beth Kegley, Jeremy G. Powell, Jiangchao Zhao, Ngan Le
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Abstract: To address this challenge, we introduce CattleFace-RGBT, an RGB-T cattle facial landmark dataset consisting of 2,300 RGB-T image pairs, for a total of 4,600 images. Creating a landmark dataset is time-consuming, but AI-assisted annotation can help. However, applying AI to thermal images is challenging due to suboptimal results from direct thermal training and infeasible RGB-thermal alignment stemming from different camera views. Therefore, we opt to transfer models trained on RGB to thermal images and refine them using our AI-assisted annotation tool, following a semi-automatic annotation approach. Accurately localizing facial keypoints on both RGB and thermal images enables us not only to discern the cattle's respiratory signs but also to measure temperatures to assess the animal's thermal state. To the best of our knowledge, this is the first dataset for cattle facial landmarks on RGB-T images. We conduct benchmarking of the CattleFace-RGBT dataset across various backbone architectures, with the objective of establishing baselines for future research, analysis, and comparison. The dataset and models are at https://github.com/UARK-AICV/CattleFace-RGBT-benchmark
Submitted 5 June, 2024; originally announced June 2024.
arXiv:2405.14131 [pdf, other]
Title: Statistical Advantages of Perturbing Cosine Router in Mixture of Experts
Authors: Huy Nguyen, Pedram Akbarian, Trang Pham, Trang Nguyen, Shujian Zhang, Nhat Ho
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning)
Abstract: The cosine router in Mixture of Experts (MoE) has recently emerged as an attractive alternative to the conventional linear router. Indeed, the cosine router demonstrates favorable performance in image and language tasks and exhibits a better ability to mitigate the representation collapse issue, which often leads to parameter redundancy and limited representation potential. Despite its empirical success, a comprehensive analysis of the cosine router in MoE has been lacking. Considering the least squares estimation of the cosine routing MoE, we demonstrate that, due to the intrinsic interaction of the model parameters in the cosine router via some partial differential equations, regardless of the structures of the experts, the estimation rates of experts and model parameters can be as slow as $\mathcal{O}(1/\log^{\tau}(n))$, where $\tau > 0$ is some constant and $n$ is the sample size. Surprisingly, these pessimistic non-polynomial convergence rates can be circumvented by a technique widely used in practice to stabilize the cosine router: simply adding noise to the $L^2$ norms in the cosine router, which we refer to as the perturbed cosine router.
Under strongly identifiable settings of the expert functions, we prove that the estimation rates for both the experts and the model parameters under the perturbed cosine routing MoE improve significantly, to polynomial rates. Finally, we conduct extensive simulation studies in both synthetic and real data settings to empirically validate our theoretical results.
Submitted 13 November, 2024; v1 submitted 22 May, 2024; originally announced May 2024.
Comments: 40 pages
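The stabilization trick the paper analyzes is easy to state in code. A sketch consistent with the abstract (noise added to the $L^2$ norms); the noise scale and distribution are our assumptions:

```python
# Cosine routing scores with perturbed norms; eps sets the noise scale.
import torch

def perturbed_cosine_router(x, W, eps=1e-2):
    # x: (B, D) token representations; W: (E, D) expert keys
    x_norm = x.norm(dim=-1, keepdim=True) + eps * torch.rand(x.shape[0], 1)
    w_norm = W.norm(dim=-1, keepdim=True) + eps * torch.rand(W.shape[0], 1)
    return (x @ W.T) / (x_norm * w_norm.T)        # (B, E) routing scores

scores = perturbed_cosine_router(torch.randn(4, 64), torch.randn(8, 64))
```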
arXiv:2405.14124 [pdf, other]
Title: Mixture of Experts Meets Prompt-Based Continual Learning
Authors: Minh Le, An Nguyen, Huy Nguyen, Trang Nguyen, Trang Pham, Linh Van Ngo, Nhat Ho
Subjects: cs.LG (Machine Learning)
Abstract: Exploiting the power of pre-trained models, prompt-based approaches stand out compared to other continual learning solutions in effectively preventing catastrophic forgetting, even with very few learnable parameters and without the need for a memory buffer. While existing prompt-based continual learning methods excel in leveraging prompts for state-of-the-art performance, they often lack a theoretical explanation for the effectiveness of prompting. This paper conducts a theoretical analysis to unravel how prompts bestow such advantages in continual learning, thus offering a new perspective on prompt design. We first show that the attention block of pre-trained models like Vision Transformers inherently encodes a special mixture-of-experts architecture, characterized by linear experts and quadratic gating score functions. This realization drives us to provide a novel view of prefix tuning, reframing it as the addition of new task-specific experts, thereby inspiring the design of a novel gating mechanism termed Non-linear Residual Gates (NoRGa). Through the incorporation of non-linear activation and a residual connection, NoRGa enhances continual learning performance while preserving parameter efficiency. The effectiveness of NoRGa is substantiated both theoretically and empirically across diverse benchmarks and pretraining paradigms. Our code is publicly available at https://github.com/Minhchuyentoancbn/MoE_PromptCL
Submitted 17 November, 2024; v1 submitted 22 May, 2024; originally announced May 2024.
Comments: Accepted to NeurIPS 2024, 30 pages
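Reading the abstract literally, the gate adds a non-linear activation and a residual path on top of the prefix contribution; the exact parameterization below is our assumption, not the paper's equations:

```python
# Assumed form of a non-linear residual gate: the pre-trained attention
# score plus a tanh-activated, scaled contribution from the new
# task-specific prefix "experts".
import torch

def norga_score(score_pretrained, score_prefix, alpha=1.0):
    # both scores: (B, heads, T_query, T_key) unnormalized attention scores
    return score_pretrained + alpha * torch.tanh(score_prefix)
```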
arXiv:2405.06816 [pdf, other]
Title: Non-stationary Domain Generalization: Theory and Algorithm
Authors: Thai-Hoang Pham, Xueru Zhang, Ping Zhang
Subjects: cs.LG (Machine Learning)
Abstract: Although recent advances in machine learning have shown success in learning from independent and identically distributed (IID) data, machine learning remains vulnerable to out-of-distribution (OOD) data in an open world. Domain generalization (DG) deals with this issue, aiming to learn a model from multiple source domains that can generalize to unseen target domains. Existing studies on DG have largely focused on stationary settings with homogeneous source domains. However, in many applications, domains may evolve along a specific direction (e.g., time, space). Without accounting for such non-stationary patterns, models trained with existing methods may fail to generalize on OOD data. In this paper, we study domain generalization in a non-stationary environment. We first examine the impact of environmental non-stationarity on model performance and establish theoretical upper bounds on the model error at target domains. Then, we propose a novel algorithm based on adaptive invariant representation learning, which leverages the non-stationary pattern to train a model that attains good performance on target domains. Experiments on both synthetic and real data validate the proposed algorithm.
Submitted 10 May, 2024; originally announced May 2024.
Comments: Accepted by UAI 2024

arXiv:2403.13204 [pdf, other]
Title: Diversity-Aware Agnostic Ensemble of Sharpness Minimizers
Authors: Anh Bui, Vy Vo, Tung Pham, Dinh Phung, Trung Le
Subjects: cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition); stat.ML (Machine Learning)
Abstract: There has long been plenty of theoretical and empirical evidence supporting the success of ensemble learning. Deep ensembles in particular take advantage of training randomness and the expressivity of individual neural networks to gain prediction diversity, ultimately leading to better generalization, robustness, and uncertainty estimation. With respect to generalization, it has been found that pursuing wider local minima results in models that are more robust to shifts between training and testing sets. A natural research question arising from these two approaches is whether a boost in generalization ability can be achieved by integrating ensemble learning and loss-sharpness minimization. Our work investigates this connection and proposes DASH, a learning algorithm that promotes diversity and flatness within deep ensembles. More concretely, DASH encourages base learners to move divergently towards low-loss regions of minimal sharpness. We provide a theoretical backbone for our method along with extensive empirical evidence demonstrating an improvement in ensemble generalizability.
arXiv:2403.06166 [pdf, other] (cs.CV, cs.RO)
Title: Cross-Cluster Shifting for Efficient and Effective 3D Object Detection in Autonomous Driving
Authors: Zhili Chen, Kien T. Pham, Maosheng Ye, Zhiqiang Shen, Qifeng Chen
Abstract: We present a new 3D point-based detector model, named Shift-SSD, for precise 3D object detection in autonomous driving. Traditional point-based 3D object detectors often employ architectures that rely on progressive downsampling of points. While this method effectively reduces computational demands and increases receptive fields, it compromises the preservation of crucial non-local information needed for accurate 3D object detection, especially in complex driving scenarios. To address this, we introduce an intriguing Cross-Cluster Shifting operation to unleash the representation capacity of the point-based detector by efficiently modeling longer-range inter-dependency while incurring only negligible overhead. Concretely, the Cross-Cluster Shifting operation enhances the conventional design by shifting partial channels from neighboring clusters, which enables richer interaction with non-local regions and thus enlarges the receptive field of clusters. We conduct extensive experiments on the KITTI, Waymo, and nuScenes datasets, and the results demonstrate the state-of-the-art performance of Shift-SSD in both detection accuracy and runtime efficiency.
Submitted 10 March, 2024; originally announced March 2024.
Comments: ICRA 2024
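The exact Shift-SSD operator is not shown in this listing; the sketch below illustrates the idea as the abstract states it, swapping a fraction of channels in from a neighboring cluster so each cluster sees non-local features. The ring neighborhood and channel fraction are assumptions for the demo.

import numpy as np

def cross_cluster_shift(feats, neighbor_idx, frac=0.25):
    """feats: (num_clusters, C) per-cluster features.
    neighbor_idx: (num_clusters,) one neighboring cluster per cluster.
    Returns features whose first `frac` of channels come from the neighbor."""
    k = int(feats.shape[1] * frac)
    out = feats.copy()
    out[:, :k] = feats[neighbor_idx, :k]   # pull partial channels across clusters
    return out

feats = np.arange(12, dtype=float).reshape(4, 3)   # 4 clusters, 3 channels
ring = np.array([1, 2, 3, 0])                      # toy neighborhood: a ring
print(cross_cluster_shift(feats, ring, frac=1/3))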
arXiv:2403.05297 [pdf, other] (cs.CV, cs.AI, cs.CL)
Title: PEEB: Part-based Image Classifiers with an Explainable and Editable Language Bottleneck
Authors: Thang M. Pham, Peijie Chen, Tin Nguyen, Seunghyun Yoon, Trung Bui, Anh Totti Nguyen
Abstract: CLIP-based classifiers rely on the prompt containing a {class name} that is known to the text encoder. Therefore, they perform poorly on new classes or classes whose names rarely appear on the Internet (e.g., scientific names of birds). For fine-grained classification, we propose PEEB, an explainable and editable classifier that (1) expresses the class name as a set of text descriptors describing the visual parts of that class; and (2) matches the embeddings of the detected parts to their textual descriptors in each class to compute a logit score for classification. In a zero-shot setting where the class names are unknown, PEEB outperforms CLIP by a huge margin (~10x in top-1 accuracy). Compared to part-based classifiers, PEEB is not only the state of the art (SOTA) in the supervised-learning setting (88.80% and 92.20% accuracy on CUB-200 and Dogs-120, respectively) but also the first to enable users to edit the text descriptors to form a new classifier without any re-training. Compared to concept bottleneck models, PEEB is also the SOTA in both zero-shot and supervised-learning settings.
Submitted 12 April, 2024; v1 submitted 8 March, 2024; originally announced March 2024.
Comments: Findings of NAACL 2024 (long paper)
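A minimal sketch of the scoring rule the abstract describes: embeddings of detected parts are matched to per-class textual descriptor embeddings, and the similarities are summed into a class logit. Random vectors stand in for CLIP-style features; the shapes (12 parts, 200 classes) merely echo the CUB-200 setting and are not the paper's configuration.

import numpy as np

def class_logits(part_embs, descriptor_embs):
    """part_embs: (P, D) embeddings of P detected parts.
    descriptor_embs: (K, P, D) one descriptor embedding per class and part.
    Returns (K,) logits: summed cosine similarity over matched parts."""
    parts = part_embs / np.linalg.norm(part_embs, axis=-1, keepdims=True)
    descs = descriptor_embs / np.linalg.norm(descriptor_embs, axis=-1, keepdims=True)
    return np.einsum("pd,kpd->k", parts, descs)   # sum over parts of dot products

rng = np.random.default_rng(2)
parts = rng.normal(size=(12, 64))          # e.g. 12 bird parts
descs = rng.normal(size=(200, 12, 64))     # e.g. 200 classes
logits = class_logits(parts, descs)
print("predicted class:", int(np.argmax(logits)))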
arXiv:2403.04100 [pdf, other] (math.AT, cs.CG)
Title: Computing Representatives of Persistent Homology Generators with a Double Twist
Authors: Tuyen Pham, Hubert Wagner
Abstract: With the growing availability of efficient tools, persistent homology is becoming a useful methodology in a variety of applications. Significant work has been devoted to implementing tools for persistent homology diagrams; however, computing representative cycles corresponding to each point in the diagram can still be inefficient. To circumvent this problem, we extend the twist algorithm of Chen and Kerber. Our extension is based on a new technique we call saving, which supplements their existing killing technique. The resulting two-pass strategy can use an existing matrix-reduction implementation as a black box and improves the efficiency of computing representatives of persistent homology generators. We prove the correctness of the new approach and experimentally demonstrate its performance.
Submitted 6 March, 2024; originally announced March 2024.
Journal ref: Canadian Conference on Computational Geometry, Volume 35, 283-290 (2023)
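The paper's "saving" technique is not public in this listing; for orientation, below is a minimal Z/2 implementation of the baseline it extends, the Chen-Kerber twist (clearing) reduction: columns are processed in decreasing dimension, and once a column acquires pivot i, column i is cleared, since it could never become a pivot column itself. It is run on a filtered triangle.

def twist_reduce(columns, dims):
    """columns: list of sets of row indices (boundary matrix over Z/2).
    dims: dims[j] = dimension of simplex j. Returns pivot map low -> column."""
    pivot_of = {}                                  # row index -> column index
    for d in range(max(dims), 0, -1):              # decreasing dimension
        for j in (c for c in range(len(columns)) if dims[c] == d):
            col = columns[j]
            while col:
                low = max(col)
                if low not in pivot_of:
                    pivot_of[low] = j
                    columns[low] = set()           # the twist: clear the pivot column
                    break
                col ^= columns[pivot_of[low]]      # Z/2 column addition
    return pivot_of

# Filtered triangle: vertices 0,1,2; edges 3=(0,1), 4=(0,2), 5=(1,2); face 6.
cols = [set(), set(), set(), {0, 1}, {0, 2}, {1, 2}, {3, 4, 5}]
dims = [0, 0, 0, 1, 1, 1, 2]
print(twist_reduce(cols, dims))   # persistence pairs as pivot -> column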
arXiv:2402.15909 [pdf, other] (cs.CV)
Title: Enhanced Droplet Analysis Using Generative Adversarial Networks
Authors: Tan-Hanh Pham, Kim-Doang Nguyen
Abstract: Precision devices play an important role in enhancing production quality and productivity in agricultural systems. Therefore, the optimization of these devices is essential in precision agriculture. Recently, with the advancement of deep learning, several studies have aimed to harness its capabilities for improving spray-system performance. However, the effectiveness of these methods heavily depends on the size of the training dataset, which is expensive and time-consuming to collect. To address the challenge of insufficient training samples, we developed an image generator named DropletGAN to generate images of droplets. The DropletGAN model is trained on a small dataset captured by a high-speed camera and is capable of generating images with progressively increasing resolution. The results demonstrate that the model can generate high-quality images of size 1024x1024. The generated images from DropletGAN are evaluated using the Fréchet inception distance (FID), with an FID score of 11.29. Furthermore, this research leverages recent advancements in computer vision and deep learning to develop a lightweight droplet detector using the synthetic dataset. As a result, the detection model achieves a 16.06% increase in mean average precision (mAP) when utilizing the synthetic dataset. To the best of our knowledge, this work is the first to employ a generative model for augmenting droplet detection. Its significance lies not only in optimizing nozzle design for constructing efficient spray systems but also in addressing the common challenge of insufficient data in various precision agriculture tasks. This work offers a critical contribution to conserving resources while striving for optimal and sustainable agricultural practices.
Submitted 27 May, 2024; v1 submitted 24 February, 2024; originally announced February 2024.
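Since the entry reports an FID of 11.29, a reference implementation of the FID formula itself may be useful. It assumes the two sets of Inception activations are already extracted into numpy arrays; random arrays stand in below.

import numpy as np
from scipy.linalg import sqrtm

def fid(act_real, act_fake):
    """Frechet inception distance between two activation sets of shape (N, D)."""
    mu1, mu2 = act_real.mean(0), act_fake.mean(0)
    c1 = np.cov(act_real, rowvar=False)
    c2 = np.cov(act_fake, rowvar=False)
    covmean = sqrtm(c1 @ c2)
    if np.iscomplexobj(covmean):          # discard numerical artifacts of sqrtm
        covmean = covmean.real
    diff = mu1 - mu2
    return diff @ diff + np.trace(c1 + c2 - 2 * covmean)

rng = np.random.default_rng(3)
a = rng.normal(0.0, 1.0, (500, 8))        # stand-ins for real activations
b = rng.normal(0.1, 1.1, (500, 8))        # stand-ins for generated activations
print("FID:", fid(a, b))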
arXiv:2402.15677 [pdf, other] (eess.SY, cs.MA)
Title: Consensus seeking in diffusive multidimensional networks with a repeated interaction pattern and time-delays
Authors: Hoang Huy Vu, Quyen Ngoc Nguyen, Chuong Van Nguyen, Tuynh Van Pham, Minh Hoang Trinh
Abstract: This paper studies a consensus problem in multidimensional networks having the same agent-to-agent interaction pattern under both intra- and cross-layer time delays. Several conditions for the agents to globally asymptotically achieve consensus are derived, which involve the overall network's structure, the local interaction pattern, and the values of the time delays. The validity of these conditions is proved by direct eigenvalue evaluation and supported by numerical simulations.
Submitted 23 February, 2024; originally announced February 2024.
Comments: 6 pages, 7 figures, submitted to a journal
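Not the paper's multidimensional, multilayer model: the toy below simulates plain discrete-time consensus on a ring where each agent reacts to neighbor states delayed by tau steps, just to make the delayed-consensus mechanism concrete. The step size and delay are illustrative and chosen small enough to stay stable.

import numpy as np

def delayed_consensus(n=6, tau=2, eps=0.2, steps=300):
    hist = [np.arange(n, dtype=float)] * (tau + 1)   # state history buffer
    for _ in range(steps):
        x, x_del = hist[-1], hist[-1 - tau]          # current and delayed states
        nbr_avg = (np.roll(x_del, 1) + np.roll(x_del, -1)) / 2
        hist.append(x + eps * (nbr_avg - x))         # move toward delayed average
        hist.pop(0)
    return hist[-1]

print(delayed_consensus())   # all entries approach a common value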
arXiv:2402.13554 [pdf, ps, other] (cs.IT, eess.SP)
Title: Secrecy Performance Analysis of Space-to-Ground Optical Satellite Communications
Authors: Thang V. Nguyen, Thanh V. Pham, Anh T. Pham, Dang T. Ngoc
Abstract: Free-space optics (FSO)-based satellite communication systems have recently received considerable attention due to their enhanced capacity compared to their radio-frequency (RF) counterparts. This paper analyzes the physical-layer security performance of space-to-ground intensity modulation/direct detection FSO satellite links under the effects of atmospheric loss, misalignment, cloud attenuation, and atmospheric turbulence-induced fading. Specifically, a wiretap channel consisting of a legitimate transmitter Alice (i.e., the satellite), a legitimate user Bob, and an eavesdropper Eve over turbulence channels modeled by the Fisher-Snedecor $\mathcal{F}$ distribution is considered. The secrecy performance, in terms of the average secrecy capacity, secrecy outage probability, and strictly positive secrecy capacity, is derived in closed form. Simulation results reveal significant impacts of satellite altitude, zenith angle, and turbulence strength on the secrecy performance.
Submitted 21 February, 2024; originally announced February 2024.
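The paper derives closed-form expressions; as a sanity-check style counterpart, the Monte Carlo sketch below draws Bob's and Eve's channel gains from a Fisher-Snedecor F distribution via scipy and estimates the average secrecy capacity and secrecy outage probability. The SNRs, F parameters, and rate threshold are illustrative assumptions, not the paper's values.

import numpy as np
from scipy.stats import f

rng = np.random.default_rng(4)
n = 200_000
h_bob = f(dfn=4, dfd=6).rvs(size=n, random_state=rng)   # turbulence fading gains
h_eve = f(dfn=4, dfd=6).rvs(size=n, random_state=rng)

snr_bob, snr_eve = 10.0, 2.0                            # mean link SNRs (linear)
cs = np.maximum(np.log2(1 + snr_bob * h_bob)
                - np.log2(1 + snr_eve * h_eve), 0)      # instantaneous secrecy capacity

r_th = 1.0                                              # target secrecy rate (bits)
print("avg secrecy capacity:", cs.mean())
print("secrecy outage prob :", np.mean(cs < r_th))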
arXiv:2402.13549 [pdf, ps, other] (cs.IT, eess.SY)
Title: Q-learning-based Joint Design of Adaptive Modulation and Precoding for Physical Layer Security in Visible Light Communications
Authors: Duc M. T. Hoang, Thanh V. Pham, Anh T. Pham, Chuyen T. Nguyen
Abstract: There has been increasing interest in physical layer security (PLS), which, compared with conventional cryptography, offers a unique approach to guaranteeing information confidentiality against eavesdroppers. In this paper, we study a joint design of adaptive $M$-ary pulse amplitude modulation (PAM) and precoding that aims to optimize the secrecy capacity and bit error rate (BER) performance of wiretap visible-light channels. The proposed design is motivated by the observation that higher-order modulation results in better secrecy capacity at the expense of a higher BER, while a proper precoding design, which can manipulate the received signal quality at the legitimate user and the eavesdropper, can also enhance secrecy performance and influence the BER. A reward function that accounts for the secrecy capacity and the BERs of the legitimate user's (Bob) and the eavesdropper's (Eve) channels is introduced and maximized. Due to the non-linearity and complexity of the reward function, it is challenging to solve the design problem using classical optimization techniques. Therefore, reinforcement learning-based designs using Q-learning and deep Q-learning are proposed to maximize the reward function. Simulation results verify that, compared with the baseline designs, the proposed joint designs achieve better reward values while keeping the BER of Bob's channel (Eve's channel) well below (above) the pre-FEC (forward error correction) BER threshold.
Submitted 21 February, 2024; originally announced February 2024.
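A hedged tabular sketch of the kind of Q-learning loop the abstract describes: states quantize channel conditions, actions pair a PAM order with a precoder scale, and a synthetic reward stands in for the secrecy-capacity/BER objective. The state space, action set, and reward model are all illustrative, not the paper's.

import numpy as np

rng = np.random.default_rng(5)
n_states = 4                      # quantized channel conditions
actions = [(M, p) for M in (2, 4, 8, 16) for p in (0.5, 1.0)]  # (PAM order, precoder scale)
Q = np.zeros((n_states, len(actions)))

def reward(s, a):
    M, p = actions[a]             # toy trade-off: bigger M helps in good channels
    good = s / (n_states - 1)     # but hurts (via BER) in bad ones
    return good * np.log2(M) * p - (1 - good) * np.log2(M) * (1 - p / 2)

alpha, gamma, eps = 0.1, 0.9, 0.1
s = int(rng.integers(n_states))
for _ in range(20_000):
    a = int(rng.integers(len(actions))) if rng.random() < eps else int(np.argmax(Q[s]))
    r = reward(s, a) + 0.01 * rng.normal()        # noisy reward observation
    s2 = int(rng.integers(n_states))              # channel evolves i.i.d. here
    Q[s, a] += alpha * (r + gamma * Q[s2].max() - Q[s, a])
    s = s2

for s in range(n_states):
    print(f"state {s}: best (M, precode) = {actions[int(np.argmax(Q[s]))]}")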
arXiv:2402.13454 [pdf, other] (cs.LG, cs.IT)
Title: Theoretical Analysis of Submodular Information Measures for Targeted Data Subset Selection
Authors: Nathan Beck, Truong Pham, Rishabh Iyer
Abstract: With the increasing volume of data used across machine learning tasks, the capability to target specific subsets of data becomes more important. To aid in this capability, the recently proposed Submodular Mutual Information (SMI) has been effectively applied across numerous tasks in the literature to perform targeted subset selection with the aid of an exemplar query set. However, such works have not provided theoretical guarantees for SMI in terms of its sensitivity to a subset's relevance and coverage of the targeted data. For the first time, we provide such guarantees by deriving similarity-based bounds on quantities related to the relevance and coverage of the targeted data. With these bounds, we show that the SMI functions, which have empirically shown success in multiple applications, are theoretically sound in achieving good query relevance and query coverage.
Submitted 11 June, 2024; v1 submitted 20 February, 2024; originally announced February 2024.
Comments: 16 pages, 10 figures. This work has been submitted to the IEEE for possible publication
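As one concrete SMI-flavored instantiation (not the paper's analysis), the sketch below greedily grows a subset that covers an exemplar query set under a facility-location style objective, the sum over queries of their best similarity to the subset, using the standard greedy rule for monotone submodular maximization. Data, queries, and the budget are illustrative.

import numpy as np

def targeted_greedy(data, queries, budget):
    sim = data @ queries.T                       # (N, |Q|) similarity matrix
    best = np.full(queries.shape[0], sim.min() - 1.0)  # best coverage per query so far
    chosen = []
    for _ in range(budget):
        gains = np.maximum(sim, best).sum(1) - best.sum()  # marginal gains
        gains[chosen] = -np.inf                            # no repeats
        i = int(np.argmax(gains))
        chosen.append(i)
        best = np.maximum(best, sim[i])
    return chosen

rng = np.random.default_rng(6)
data = rng.normal(size=(1000, 16))
queries = data[:5] + 0.1 * rng.normal(size=(5, 16))   # target around 5 exemplars
print("selected indices:", targeted_greedy(data, queries, budget=10))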
arXiv:2402.06226 [pdf] (eess.SY, cs.LG)
Title: N-1 Reduced Optimal Power Flow Using Augmented Hierarchical Graph Neural Network
Authors: Thuan Pham, Xingpeng Li
Abstract: Optimal power flow (OPF) is used to perform generation redispatch in power system real-time operations. N-1 OPF can ensure safe grid operations under diverse contingency scenarios. For large and intricate power networks with numerous variables and constraints, achieving an optimal solution for real-time N-1 OPF requires substantial computational resources. To mitigate this challenge, machine learning (ML) is introduced as an additional tool for dynamically predicting congested or heavily loaded lines. In this paper, an advanced ML model known as the augmented hierarchical graph neural network (AHGNN) is proposed to predict critical congested lines and create an N-1 reduced OPF (N-1 ROPF). The proposed AHGNN-enabled N-1 ROPF achieves a remarkable reduction in computing time while retaining solution quality. Several variations of GNN-based ML models are also implemented as benchmarks to demonstrate the effectiveness of the proposed AHGNN approach. Case studies show that the proposed AHGNN and the associated N-1 ROPF are highly effective in reducing computation time while preserving solution quality, highlighting the promising potential of ML, particularly GNNs, in enhancing power system operations.
Submitted 9 February, 2024; originally announced February 2024.
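AHGNN itself is not public in this listing; at the level of building blocks, the sketch below runs one graph-convolution pass over a small bus graph and scores each line for congestion from its endpoint embeddings, the kind of layer a congested-line predictor stacks. The topology, bus features, and random weights are all stand-ins.

import numpy as np

rng = np.random.default_rng(7)
n_bus, n_feat, n_hid = 5, 3, 8
A = np.array([[0, 1, 0, 0, 1],
              [1, 0, 1, 0, 0],
              [0, 1, 0, 1, 0],
              [0, 0, 1, 0, 1],
              [1, 0, 0, 1, 0]], dtype=float)    # bus adjacency (ring grid)
X = rng.normal(size=(n_bus, n_feat))            # bus features (e.g. load, generation)
lines = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)]  # branch list

A_hat = A + np.eye(n_bus)                       # add self-loops
D_inv = np.diag(1 / A_hat.sum(1))               # row-normalization
W1 = rng.normal(size=(n_feat, n_hid))
w2 = rng.normal(size=2 * n_hid)

H = np.maximum(D_inv @ A_hat @ X @ W1, 0)       # one GCN layer with ReLU
scores = np.array([w2 @ np.concatenate([H[i], H[j]]) for i, j in lines])
p_congested = 1 / (1 + np.exp(-scores))         # per-line congestion probability
print(np.round(p_congested, 3))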