Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 342 results for author: <span class="mathjax">Saha, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Saha%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Saha, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Saha%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Saha, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Saha%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> <li> <a href="/search/?searchtype=author&query=Saha%2C+S&start=300" class="pagination-link " aria-label="Page 7" aria-current="page">7 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05378">arXiv:2411.05378</a> <span> [<a href="https://arxiv.org/pdf/2411.05378">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Machine learning for prediction of dose-volume histograms of organs-at-risk in prostate cancer from simple structure volume parameters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+S">Saheli Saha</a>, <a href="/search/cs?searchtype=author&query=Banerjee%2C+D">Debasmita Banerjee</a>, <a href="/search/cs?searchtype=author&query=Ram%2C+R">Rishi Ram</a>, <a href="/search/cs?searchtype=author&query=Reddy%2C+G">Gowtham Reddy</a>, <a href="/search/cs?searchtype=author&query=Guha%2C+D">Debashree Guha</a>, <a href="/search/cs?searchtype=author&query=Sarkar%2C+A">Arnab Sarkar</a>, <a href="/search/cs?searchtype=author&query=Dutta%2C+B">Bapi Dutta</a>, <a href="/search/cs?searchtype=author&query=S%2C+M+A">Moses ArunSingh S</a>, <a href="/search/cs?searchtype=author&query=Chakraborty%2C+S">Suman Chakraborty</a>, <a href="/search/cs?searchtype=author&query=Mallick%2C+I">Indranil Mallick</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05378v1-abstract-short" 
style="display: inline;"> Dose prediction is an area of ongoing research that facilitates radiotherapy planning. Most commercial models utilise imaging data and intense computing resources. This study aimed to predict the dose-volume of rectum and bladder from volumes of target, at-risk structure organs and their overlap regions using machine learning. Dose-volume information of 94 patients with prostate cancer planned for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05378v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05378v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05378v1-abstract-full" style="display: none;"> Dose prediction is an area of ongoing research that facilitates radiotherapy planning. Most commercial models utilise imaging data and intense computing resources. This study aimed to predict the dose-volume of rectum and bladder from volumes of target, at-risk structure organs and their overlap regions using machine learning. Dose-volume information of 94 patients with prostate cancer planned for 6000cGy in 20 fractions was exported from the treatment planning system as text files and mined to create a training dataset. Several statistical modelling, machine learning methods, and a new fuzzy rule-based prediction (FRBP) model were explored and validated on an independent dataset of 39 patients. The median absolute error was 2.0%-3.7% for bladder and 1.7-2.4% for rectum in the 4000-6420cGy range. For 5300cGy, 5600cGy and 6000cGy, the median difference was less than 2.5% for rectum and 3.8% for bladder. The FRBP model produced errors of 1.2%, 1.3%, 0.9% and 1.6%, 1.2%, 0.1% for the rectum and bladder respectively at these dose levels. These findings indicate feasibility of obtaining accurate predictions of the clinically important dose-volume parameters for rectum and bladder using just the volumes of these structures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05378v1-abstract-full').style.display = 'none'; document.getElementById('2411.05378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
2. arXiv:2411.03223 [pdf, other] (https://arxiv.org/abs/2411.03223)
Title: Beyond Grid Data: Exploring Graph Neural Networks for Earth Observation
Authors: Shan Zhao, Zhaiyu Chen, Zhitong Xiong, Yilei Shi, Sudipan Saha, Xiao Xiang Zhu
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Abstract: Earth Observation (EO) data analysis has been revolutionized by deep learning (DL), with applications typically limited to grid-like data structures. Graph Neural Networks (GNNs) emerge as an important innovation, propelling DL into the non-Euclidean domain. Naturally, GNNs can effectively tackle the challenges posed by the diverse modalities, multiple sensors, and heterogeneous nature of EO data. To introduce GNNs in the related domains, our review begins by offering fundamental knowledge on GNNs. Then, we summarize the generic problems in EO to which GNNs can offer potential solutions. Following this, we explore a broad spectrum of GNN applications to scientific problems in Earth systems, covering areas such as weather and climate analysis, disaster management, air quality monitoring, agriculture, land cover classification, hydrological process modeling, and urban modeling. The rationale behind adopting GNNs in these fields is explained, alongside methodologies for organizing graphs and designing favorable architectures for various tasks. Furthermore, we highlight methodological challenges of implementing GNNs in these domains and possible solutions that could guide future research. While acknowledging that GNNs are not a universal solution, we conclude the paper by comparing them with other popular architectures, such as transformers, and analyzing their potential synergies.
Submitted 6 November, 2024; v1 submitted 5 November, 2024; originally announced November 2024.
Comments: Accepted for publication in Geoscience and Remote Sensing Magazine (GRSM)

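For readers unfamiliar with the basic operation the surveyed models build on, here is a single mean-aggregation message-passing layer in plain NumPy; the graph, features, and weights are toy values, not drawn from the review.

```python
# One mean-aggregation message-passing layer in plain NumPy: the basic graph
# operation GNN-based EO models build on. Graph and weights are toy values.
import numpy as np

rng = np.random.default_rng(0)
A = np.array([[0, 1, 1, 0],   # adjacency matrix of a 4-node graph,
              [1, 0, 1, 0],   # e.g., weather stations linked by proximity
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
X = rng.random((4, 8))        # per-node features (e.g., sensor readings)
W = rng.random((8, 16))       # learnable weight matrix

A_hat = A + np.eye(4)                       # add self-loops
D_inv = np.diag(1.0 / A_hat.sum(axis=1))    # row-normalize: mean over neighbours
H = np.maximum(D_inv @ A_hat @ X @ W, 0.0)  # aggregate, transform, ReLU
print(H.shape)  # (4, 16) updated node embeddings
```
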
3. arXiv:2411.00762 [pdf, other] (https://arxiv.org/abs/2411.00762)
Title: Face Anonymization Made Simple
Authors: Han-Wei Kung, Tuomas Varanka, Sanjay Saha, Terence Sim, Nicu Sebe
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CR (Cryptography and Security)
Abstract: Current face anonymization techniques often depend on identity loss calculated by face recognition models, which can be inaccurate and unreliable. Additionally, many methods require supplementary data such as facial landmarks and masks to guide the synthesis process. In contrast, our approach uses diffusion models with only a reconstruction loss, eliminating the need for facial landmarks or masks while still producing images with intricate, fine-grained details. We validated our results on two public benchmarks through both quantitative and qualitative evaluations. Our model achieves state-of-the-art performance in three key areas: identity anonymization, facial attribute preservation, and image quality. Beyond its primary function of anonymization, our model can also perform face swapping by incorporating an additional facial image as input, demonstrating its versatility and potential for diverse applications. Our code and models are available at https://github.com/hanweikung/face_anon_simple.
Submitted 1 November, 2024; originally announced November 2024.

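The distinguishing claim is training with only a reconstruction loss, with no face-recognition identity term. A schematic PyTorch training step illustrating that contrast is below; the tiny ConvNet is a stand-in for the paper's diffusion model, and the data is random noise.

```python
# Schematic training step driven only by a reconstruction loss, the idea the
# abstract contrasts with identity-loss methods. The tiny ConvNet is a
# stand-in for the paper's diffusion model; the "faces" are random tensors.
import torch
import torch.nn as nn

generator = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 3, 3, padding=1),
)
optimizer = torch.optim.Adam(generator.parameters(), lr=1e-4)

faces = torch.rand(8, 3, 64, 64)                      # toy batch of face crops
reconstruction = generator(faces)
loss = nn.functional.mse_loss(reconstruction, faces)  # no identity term at all
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(float(loss))
```
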
4. arXiv:2410.21170 [pdf, other] (https://arxiv.org/abs/2410.21170)
Title: Joint Audio-Visual Idling Vehicle Detection with Streamlined Input Dependencies
Authors: Xiwen Li, Rehman Mohammed, Tristalee Mangin, Surojit Saha, Ross T. Whitaker, Kerry E. Kelly, Tolga Tasdizen
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Abstract: Idling vehicle detection (IVD) can be helpful in monitoring and reducing unnecessary idling and can be integrated into real-time systems to address the resulting pollution and harmful products. The previous approach [13], a non-end-to-end model, requires extra user clicks to specify a part of the input, making system deployment more error-prone or even infeasible. In contrast, we introduce an end-to-end joint audio-visual IVD task designed to detect vehicles visually under three states: moving, idling, and engine off. Unlike feature co-occurrence tasks such as audio-visual vehicle tracking, our IVD task addresses complementary features, where labels cannot be determined by a single modality alone. To this end, we propose AVIVD-Net, a novel network that integrates audio and visual features through a bidirectional attention mechanism. AVIVD-Net streamlines the input process by learning a joint feature space, reducing the deployment complexity of previous methods. Additionally, we introduce the AVIVD dataset, which is seven times larger than previous datasets, offering significantly more annotated samples for studying the IVD problem. Our model achieves performance comparable to prior approaches, making it suitable for automated deployment. Furthermore, by evaluating AVIVD-Net on the feature co-occurrence public dataset MAVD [23], we demonstrate its potential for extension to self-driving vehicle video-camera setups.
Submitted 28 October, 2024; originally announced October 2024.

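The abstract names a bidirectional attention mechanism for fusing the two modalities. A minimal PyTorch sketch of that general pattern follows; token counts, dimensions, and the fusion head are guesses for illustration, not the AVIVD-Net design.

```python
# Sketch of bidirectional audio-visual attention fusion, the mechanism
# AVIVD-Net is said to use. All dimensions and the fusion head are guesses.
import torch
import torch.nn as nn

d = 128
attn_v_on_a = nn.MultiheadAttention(d, num_heads=4, batch_first=True)
attn_a_on_v = nn.MultiheadAttention(d, num_heads=4, batch_first=True)
classifier = nn.Linear(2 * d, 3)  # three states: moving, idling, engine off

audio = torch.rand(2, 50, d)      # (batch, audio tokens, dim)
video = torch.rand(2, 196, d)     # (batch, visual tokens, dim)

v_ctx, _ = attn_v_on_a(query=video, key=audio, value=audio)  # video attends to audio
a_ctx, _ = attn_a_on_v(query=audio, key=video, value=video)  # audio attends to video
fused = torch.cat([v_ctx.mean(dim=1), a_ctx.mean(dim=1)], dim=-1)
print(classifier(fused).shape)    # torch.Size([2, 3])
```
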
5. arXiv:2410.14031 [pdf, other] (https://arxiv.org/abs/2410.14031)
Title: Modeling the Human Visual System: Comparative Insights from Response-Optimized and Task-Optimized Vision Models, Language Models, and Different Readout Mechanisms
Authors: Shreya Saha, Ishaan Chadha, Meenakshi Khosla
Subjects: cs.NE (Neural and Evolutionary Computing); cs.LG (Machine Learning)
Abstract: Over the past decade, predictive modeling of neural responses in the primate visual system has advanced significantly, largely driven by various DNN approaches. These include models optimized directly for visual recognition, cross-modal alignment through contrastive objectives, neural response prediction from scratch, and large language model embeddings. Likewise, different readout mechanisms, ranging from fully linear to spatial-feature factorized methods, have been explored for mapping network activations to neural responses. Despite the diversity of these approaches, it remains unclear which method performs best across different visual regions. In this study, we systematically compare these approaches for modeling the human visual system and investigate alternative strategies to improve response predictions. Our findings reveal that for early to mid-level visual areas, response-optimized models with visual inputs offer superior prediction accuracy, while for higher visual regions, embeddings from LLMs based on detailed contextual descriptions of images and task-optimized models pretrained on large vision datasets provide the best fit. Through comparative analysis of these modeling approaches, we identified three distinct regions in the visual cortex: one sensitive primarily to perceptual features of the input that are not captured by linguistic descriptions, another attuned to fine-grained visual details representing semantic information, and a third responsive to abstract, global meanings aligned with linguistic content. We also highlight the critical role of readout mechanisms, proposing a novel scheme that modulates receptive fields and feature maps based on semantic content, resulting in an accuracy boost of 3–23% over existing state-of-the-art approaches for all models and brain regions. Together, these findings offer key insights into building more precise models of the visual system.
Submitted 17 October, 2024; originally announced October 2024.

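The simplest readout the abstract ranges over is a fully linear map from activations to responses. Here is a minimal sketch of that baseline with ridge regression; activations and voxel responses are synthetic stand-ins, and the paper's proposed semantic-modulated readout is not attempted.

```python
# A fully linear readout baseline: ridge regression from network activations
# to voxel responses. Activations and responses are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.default_rng(0)
n_stim, n_feat, n_vox = 1000, 512, 200
acts = rng.standard_normal((n_stim, n_feat))            # activations per image
true_map = rng.standard_normal((n_feat, n_vox)) * 0.1
resp = acts @ true_map + rng.standard_normal((n_stim, n_vox))  # noisy voxels

readout = RidgeCV(alphas=np.logspace(-2, 4, 7)).fit(acts[:800], resp[:800])
print("held-out R^2:", readout.score(acts[800:], resp[800:]))
```
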
6. arXiv:2410.01599 [pdf, other] (https://arxiv.org/abs/2410.01599)
Title: Towards Model Discovery Using Domain Decomposition and PINNs
Authors: Tirtho S. Saha, Alexander Heinlein, Cordula Reisch
Subjects: math.NA (Numerical Analysis); cs.LG (Machine Learning)
Abstract: We enhance machine learning algorithms for learning model parameters in complex systems represented by ordinary differential equations (ODEs) with domain decomposition methods. The study evaluates the performance of two approaches, namely (vanilla) Physics-Informed Neural Networks (PINNs) and Finite Basis Physics-Informed Neural Networks (FBPINNs), in learning the dynamics of test models with quasi-stationary long-time behavior. We test the approaches on data sets from different dynamical regions and with varying noise levels. We find better performance for the FBPINN approach than for the vanilla PINN approach, even in cases with data from only a quasi-stationary time domain with few dynamics.
Submitted 2 October, 2024; originally announced October 2024.
MSC Class: 68T07; 65M55; 92-08

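For concreteness, here is a minimal vanilla PINN for a single ODE, the baseline the paper compares against FBPINNs. The logistic test equation u' = u(1 - u) with u(0) = 0.1 is our stand-in, not necessarily one of the paper's test models, and no domain decomposition is performed.

```python
# Minimal vanilla PINN for the logistic ODE u' = u(1 - u), u(0) = 0.1.
# A stand-in test problem; FBPINNs would additionally decompose the domain.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(1, 32), nn.Tanh(),
                    nn.Linear(32, 32), nn.Tanh(),
                    nn.Linear(32, 1))
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
t = torch.linspace(0.0, 10.0, 200).reshape(-1, 1).requires_grad_(True)

for step in range(2000):
    u = net(t)
    du = torch.autograd.grad(u, t, torch.ones_like(u), create_graph=True)[0]
    residual = du - u * (1.0 - u)                 # ODE residual
    ic = (net(torch.zeros(1, 1)) - 0.1) ** 2      # initial-condition penalty
    loss = residual.pow(2).mean() + ic.mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(float(loss))
```
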
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 65M55; 92-08 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15356">arXiv:2409.15356</a> <span> [<a href="https://arxiv.org/pdf/2409.15356">pdf</a>, <a href="https://arxiv.org/ps/2409.15356">ps</a>, <a href="https://arxiv.org/format/2409.15356">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> TCG CREST System Description for the Second DISPLACE Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Raghav%2C+N">Nikhil Raghav</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Subhajit Saha</a>, <a href="/search/cs?searchtype=author&query=Sahidullah%2C+M">Md Sahidullah</a>, <a href="/search/cs?searchtype=author&query=Das%2C+S">Swagatam Das</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15356v1-abstract-short" style="display: inline;"> In this report, we describe the speaker diarization (SD) and language diarization (LD) systems developed by our team for the Second DISPLACE Challenge, 2024. Our contributions were dedicated to Track 1 for SD and Track 2 for LD in multilingual and multi-speaker scenarios. We investigated different speech enhancement techniques, voice activity detection (VAD) techniques, unsupervised domain categor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15356v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15356v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15356v1-abstract-full" style="display: none;"> In this report, we describe the speaker diarization (SD) and language diarization (LD) systems developed by our team for the Second DISPLACE Challenge, 2024. Our contributions were dedicated to Track 1 for SD and Track 2 for LD in multilingual and multi-speaker scenarios. We investigated different speech enhancement techniques, voice activity detection (VAD) techniques, unsupervised domain categorization, and neural embedding extraction architectures. We also exploited the fusion of various embedding extraction models. We implemented our system with the open-source SpeechBrain toolkit. Our final submissions use spectral clustering for both the speaker and language diarization. We achieve about $7\%$ relative improvement over the challenge baseline in Track 1. We did not obtain improvement over the challenge baseline in Track 2. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15356v1-abstract-full').style.display = 'none'; document.getElementById('2409.15356v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15226">arXiv:2409.15226</a> <span> [<a href="https://arxiv.org/pdf/2409.15226">pdf</a>, <a href="https://arxiv.org/format/2409.15226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Intelligent Routing Algorithm over SDN: Reusable Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wumian%2C+W">Wang Wumian</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sajal Saha</a>, <a href="/search/cs?searchtype=author&query=Haque%2C+A">Anwar Haque</a>, <a href="/search/cs?searchtype=author&query=Sidebottom%2C+G">Greg Sidebottom</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15226v1-abstract-short" style="display: inline;"> Traffic routing is vital for the proper functioning of the Internet. As users and network traffic increase, researchers try to develop adaptive and intelligent routing algorithms that can fulfill various QoS requirements. Reinforcement Learning (RL) based routing algorithms have shown better performance than traditional approaches. We developed a QoS-aware, reusable RL routing algorithm, RLSR-Rout… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15226v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15226v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15226v1-abstract-full" style="display: none;"> Traffic routing is vital for the proper functioning of the Internet. As users and network traffic increase, researchers try to develop adaptive and intelligent routing algorithms that can fulfill various QoS requirements. Reinforcement Learning (RL) based routing algorithms have shown better performance than traditional approaches. We developed a QoS-aware, reusable RL routing algorithm, RLSR-Routing over SDN. During the learning process, our algorithm ensures loop-free path exploration. While finding the path for one traffic demand (a source destination pair with certain amount of traffic), RLSR-Routing learns the overall network QoS status, which can be used to speed up algorithm convergence when finding the path for other traffic demands. By adapting Segment Routing, our algorithm can achieve flow-based, source packet routing, and reduce communications required between SDN controller and network plane. Our algorithm shows better performance in terms of load balancing than the traditional approaches. 
9. arXiv:2409.15028 [pdf, other] (https://arxiv.org/abs/2409.15028)
Title: Region Mixup
Authors: Saptarshi Saha, Utpal Garain
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Abstract: This paper introduces a simple extension of mixup (Zhang et al., 2018) data augmentation to enhance generalization in visual recognition tasks. Unlike the vanilla mixup method, which blends entire images, our approach focuses on combining regions from multiple images.
Submitted 23 September, 2024; originally announced September 2024.
Comments: Published as a Tiny Paper at ICLR 2024
ACM Class: I.2.10
Journal ref: The Second Tiny Papers Track at ICLR 2024

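A hedged sketch of region-level mixing in that spirit: paste a random rectangle from one image into another and weight the labels by area. Region Mixup's exact region scheme may differ; this is only the generic idea.

```python
# Region-level mixing sketch: rectangle from x2 pasted into x1, labels mixed
# by area. Illustrative only; not necessarily Region Mixup's exact scheme.
import numpy as np

def region_mix(x1, y1, x2, y2, rng):
    """x1, x2: (H, W, C) images; y1, y2: one-hot label vectors."""
    H, W, _ = x1.shape
    rh, rw = rng.integers(1, H + 1), rng.integers(1, W + 1)   # region size
    top = rng.integers(0, H - rh + 1)
    left = rng.integers(0, W - rw + 1)
    out = x1.copy()
    out[top:top + rh, left:left + rw] = x2[top:top + rh, left:left + rw]
    lam = 1.0 - (rh * rw) / (H * W)   # fraction of x1 kept
    return out, lam * y1 + (1.0 - lam) * y2

rng = np.random.default_rng(0)
img, label = region_mix(np.zeros((32, 32, 3)), np.array([1.0, 0.0]),
                        np.ones((32, 32, 3)), np.array([0.0, 1.0]), rng)
print(label)  # mixed label, weighted by the pasted region's area
```
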
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a Tiny Paper at ICLR 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The Second Tiny Papers Track at ICLR 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13181">arXiv:2409.13181</a> <span> [<a href="https://arxiv.org/pdf/2409.13181">pdf</a>, <a href="https://arxiv.org/format/2409.13181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Overcoming Data Limitations in Internet Traffic Forecasting: LSTM Models with Transfer Learning and Wavelet Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sajal Saha</a>, <a href="/search/cs?searchtype=author&query=Haque%2C+A">Anwar Haque</a>, <a href="/search/cs?searchtype=author&query=Sidebottom%2C+G">Greg Sidebottom</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13181v1-abstract-short" style="display: inline;"> Effective internet traffic prediction in smaller ISP networks is challenged by limited data availability. This paper explores this issue using transfer learning and data augmentation techniques with two LSTM-based models, LSTMSeq2Seq and LSTMSeq2SeqAtn, initially trained on a comprehensive dataset provided by Juniper Networks and subsequently applied to smaller datasets. The datasets represent rea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13181v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13181v1-abstract-full" style="display: none;"> Effective internet traffic prediction in smaller ISP networks is challenged by limited data availability. This paper explores this issue using transfer learning and data augmentation techniques with two LSTM-based models, LSTMSeq2Seq and LSTMSeq2SeqAtn, initially trained on a comprehensive dataset provided by Juniper Networks and subsequently applied to smaller datasets. The datasets represent real internet traffic telemetry, offering insights into diverse traffic patterns across different network domains. Our study revealed that while both models performed well in single-step predictions, multi-step forecasts were challenging, particularly in terms of long-term accuracy. In smaller datasets, LSTMSeq2Seq generally outperformed LSTMSeq2SeqAtn, indicating that higher model complexity does not necessarily translate to better performance. The models' effectiveness varied across different network domains, reflecting the influence of distinct traffic characteristics. To address data scarcity, Discrete Wavelet Transform was used for data augmentation, leading to significant improvements in model performance, especially in shorter-term forecasts. 
11. arXiv:2409.13179 [pdf, other] (https://arxiv.org/abs/2409.13179)
Title: ConvLSTMTransNet: A Hybrid Deep Learning Approach for Internet Traffic Telemetry
Authors: Sajal Saha, Saikat Das, Glaucio H. S. Carvalho
Subjects: cs.LG (Machine Learning)
Abstract: In this paper, we present a novel hybrid deep learning model, named ConvLSTMTransNet, designed for time series prediction, with a specific application to internet traffic telemetry. This model integrates the strengths of Convolutional Neural Networks (CNNs), Long Short-Term Memory (LSTM) networks, and Transformer encoders to capture the complex spatial-temporal relationships inherent in time series data. ConvLSTMTransNet was evaluated against three baseline models, RNN, LSTM, and Gated Recurrent Unit (GRU), using real internet traffic data sampled from high-speed ports on a provider edge router. Performance metrics such as Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Weighted Absolute Percentage Error (WAPE) were used to assess each model's accuracy. Our findings demonstrate that ConvLSTMTransNet outperforms the baseline models by approximately 10% in prediction accuracy, owing to architectural features that enhance its ability to capture temporal dependencies and extract spatial features from internet traffic data. Overall, these findings underscore the importance of employing advanced architectures tailored to the complexities of internet traffic data for achieving more precise predictions.
Submitted 19 September, 2024; originally announced September 2024.
Comments: 6 pages, 1 figure. Submitted to IEEE Virtual Conference on Communications 2024

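A schematic PyTorch hybrid matching the component list in the abstract (CNN, LSTM, Transformer encoder) for univariate forecasting is shown below; the layer sizes and wiring are guesses, not the published ConvLSTMTransNet architecture.

```python
# Schematic CNN + LSTM + Transformer-encoder hybrid for univariate traffic
# forecasting. Layer sizes and wiring are guesses, not the paper's model.
import torch
import torch.nn as nn

class HybridForecaster(nn.Module):
    def __init__(self, d=64):
        super().__init__()
        self.conv = nn.Conv1d(1, d, kernel_size=3, padding=1)  # local patterns
        self.lstm = nn.LSTM(d, d, batch_first=True)            # longer memory
        layer = nn.TransformerEncoderLayer(d_model=d, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)  # global context
        self.head = nn.Linear(d, 1)

    def forward(self, x):                 # x: (batch, seq_len, 1)
        h = self.conv(x.transpose(1, 2)).transpose(1, 2)
        h, _ = self.lstm(h)
        h = self.encoder(h)
        return self.head(h[:, -1])        # next-step prediction

model = HybridForecaster()
print(model(torch.rand(8, 96, 1)).shape)  # torch.Size([8, 1])
```
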
12. arXiv:2409.13177 [pdf, other] (https://arxiv.org/abs/2409.13177)
Title: An Adaptive End-to-End IoT Security Framework Using Explainable AI and LLMs
Authors: Sudipto Baral, Sajal Saha, Anwar Haque
Subjects: cs.LG (Machine Learning); cs.CR (Cryptography and Security)
Abstract: The exponential growth of the Internet of Things (IoT) has significantly increased the complexity and volume of cybersecurity threats, necessitating the development of advanced, scalable, and interpretable security frameworks. This paper presents an innovative, comprehensive framework for real-time IoT attack detection and response that leverages Machine Learning (ML), Explainable AI (XAI), and Large Language Models (LLMs). By integrating XAI techniques such as SHAP (SHapley Additive exPlanations) and LIME (Local Interpretable Model-agnostic Explanations) with a model-independent architecture, we ensure our framework's adaptability across various ML algorithms. Additionally, the incorporation of LLMs enhances the interpretability and accessibility of detection decisions, providing system administrators with actionable, human-understandable explanations of detected threats. Our end-to-end framework not only facilitates a seamless transition from model development to deployment but also represents a real-world application capability that is often lacking in existing research. In our experiments with the CIC-IoT-2023 dataset [Neto et al., 2023], the Gemini and OpenAI LLMs demonstrate unique strengths in attack mitigation: Gemini offers precise, focused strategies, while OpenAI provides extensive, in-depth security measures. Incorporating the SHAP and LIME algorithms within the XAI layer provides comprehensive insight into attack detection, highlighting opportunities for model improvement through detailed feature analysis, fine-tuning, and the adaptation of misclassifications to enhance accuracy.
Submitted 19 September, 2024; originally announced September 2024.
Comments: 6 pages, 1 figure. Accepted at the 2024 IEEE World Forum on Internet of Things (WF-IoT)

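The SHAP/LIME pairing is standard enough to show in miniature: explain one prediction of a model-agnostic classifier with both. The classifier and data below are toy stand-ins for a detector trained on CIC-IoT-2023 flow features.

```python
# Model-agnostic explanation of one prediction with SHAP and LIME, the XAI
# pair the framework integrates. Classifier and data are toy stand-ins.
import numpy as np
import shap
from lime.lime_tabular import LimeTabularExplainer
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X = rng.random((500, 6))
y = (X[:, 0] + X[:, 3] > 1.0).astype(int)   # toy "attack" labelling rule
clf = RandomForestClassifier(random_state=0).fit(X, y)

shap_values = shap.TreeExplainer(clf).shap_values(X[:1])  # per-feature attribution
explainer = LimeTabularExplainer(X, mode="classification")
lime_exp = explainer.explain_instance(X[0], clf.predict_proba, num_features=6)
print(lime_exp.as_list())  # ranked local feature contributions
```
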
arXiv:2409.12147 (https://arxiv.org/abs/2409.12147) [pdf, other] cs.CL (Computation and Language)
MAgICoRe: Multi-Agent, Iterative, Coarse-to-Fine Refinement for Reasoning
Authors: Justin Chih-Yao Chen, Archiki Prasad, Swarnadeep Saha, Elias Stengel-Eskin, Mohit Bansal
Abstract: Large Language Models' (LLM) reasoning can be improved using test-time aggregation strategies, i.e., generating multiple samples and voting among generated samples. While these improve performance, they often reach a saturation point. Refinement offers an alternative by using LLM-generated feedback to improve solution quality. However, refinement introduces 3 key challenges: (1) Excessive refinement: uniformly refining all instances can over-correct and reduce the overall performance. (2) Inability to localize and address errors: LLMs have a limited ability to self-correct and struggle to identify and correct their own mistakes. (3) Insufficient refinement: deciding how many iterations of refinement are needed is non-trivial, and stopping too soon could leave errors unaddressed. To tackle these issues, we propose MAgICoRe, which avoids excessive refinement by categorizing problem difficulty as easy or hard, solving easy problems with coarse-grained aggregation and hard ones with fine-grained and iterative multi-agent refinement. To improve error localization, we incorporate external step-wise reward model (RM) scores.
Moreover, to ensure effective refinement, we employ a multi-agent loop with three agents: Solver, Reviewer (which generates targeted feedback based on step-wise RM scores), and the Refiner (which incorporates feedback). To ensure sufficient refinement, we re-evaluate updated solutions, iteratively initiating further rounds of refinement. We evaluate MAgICoRe on Llama-3-8B and GPT-3.5 and show its effectiveness across 5 math datasets. Even one iteration of MAgICoRe beats Self-Consistency by 3.4%, Best-of-k by 3.2%, and Self-Refine by 4.0% while using less than half the samples. Unlike iterative refinement with baselines, MAgICoRe continues to improve with more iterations. Finally, our ablations highlight the importance of MAgICoRe's RMs and multi-agent communication.
Submitted 18 September, 2024; originally announced September 2024.
Comments: 22 pages, code: https://github.com/dinobby/MAgICoRe
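The control flow described above can be summarized schematically (the real system, fine-tuned models, and prompts are in the authors' repository; solve, review, refine, and score are stand-ins for LLM and reward-model calls):

from collections import Counter

def magicore(problem, solve, review, refine, score, k=5, threshold=0.8, max_rounds=3):
    samples = [solve(problem) for _ in range(k)]
    best, votes = Counter(samples).most_common(1)[0]
    if votes / k >= threshold:        # "easy": coarse-grained aggregation suffices
        return best
    solution = best                   # "hard": fine-grained multi-agent refinement
    for _ in range(max_rounds):
        step_scores = score(problem, solution)      # external step-wise RM scores
        if min(step_scores) >= threshold:           # re-evaluate updated solutions
            break
        feedback = review(problem, solution, step_scores)   # Reviewer agent
        solution = refine(problem, solution, feedback)      # Refiner agent
    return solution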
arXiv:2409.05876 (https://arxiv.org/abs/2409.05876) [pdf] cs.CY (Computers and Society), cs.HC (Human-Computer Interaction)
Exploring and Visualizing COVID-19 Trends in India: Vulnerabilities and Mitigation Strategies
Authors: Swayamjit Saha, Kuntal Ghosh, Garga Chatterjee, J. Edward Swan II
Abstract: Visualizing data plays a pivotal role in portraying important scientific information. Visualization techniques help display relevant graphical interpretations of data whose structure would otherwise remain hidden. In this paper, we explore COVID-19 pandemic trends in the subcontinent of India, examining how far the infection rate spiked in the year 2020 and how India's public health division helped curb the spread of the novel virus by installing vaccination centers across the country. The paper contributes an empirical study of the virus's impact on the country through extensive exploratory data analysis of data collected from the official government portal. Our work supports the view that data visualization is central to understanding public health problems and beyond, and to taking the measures necessary to curb the existing pandemic.
Submitted 25 August, 2024; originally announced September 2024.
Comments: 6 pages, 6 figures
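As a trivial illustration of the kind of trend plot such an analysis produces (the numbers below are invented; the paper uses data from the official government portal):

import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({
    "month": pd.date_range("2020-03-01", periods=10, freq="MS"),
    "new_cases": [500, 2000, 9000, 100000, 250000, 300000,   # hypothetical values,
                  260000, 180000, 120000, 90000],            # not real case counts
})
ax = df.plot(x="month", y="new_cases", marker="o", legend=False)
ax.set_ylabel("New confirmed cases")
ax.set_title("Monthly COVID-19 cases, India (toy data)")
plt.tight_layout()
plt.savefig("covid_trend.png")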
arXiv:2408.12023 (https://arxiv.org/abs/2408.12023) [pdf, other] cs.LG (Machine Learning), cs.CL (Computation and Language), cs.CV (Computer Vision and Pattern Recognition)
Limitations in Employing Natural Language Supervision for Sensor-Based Human Activity Recognition -- And Ways to Overcome Them
Authors: Harish Haresamudram, Apoorva Beedu, Mashfiqui Rabbi, Sankalita Saha, Irfan Essa, Thomas Ploetz
Abstract: Cross-modal contrastive pre-training between natural language and other modalities, e.g., vision and audio, has demonstrated astonishing performance and effectiveness across a diverse variety of tasks and domains. In this paper, we investigate whether such natural language supervision can be used for wearable sensor based Human Activity Recognition (HAR), and discover that, surprisingly, it performs substantially worse than standard end-to-end training and self-supervision. We identify the primary causes as sensor heterogeneity and the lack of rich, diverse text descriptions of activities. To mitigate their impact, we also develop strategies and assess their effectiveness through an extensive experimental evaluation. These strategies lead to significant increases in activity recognition, bringing performance closer to supervised and self-supervised training, while also enabling the recognition of unseen activities and cross-modal retrieval of videos. Overall, our work paves the way for better sensor-language learning, ultimately leading to the development of foundational models for HAR using wearables.
Submitted 21 August, 2024; originally announced August 2024.
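The contrastive pre-training objective the abstract refers to is the CLIP-style symmetric InfoNCE loss; a minimal sketch over (sensor-window, text) embedding pairs, with random tensors standing in for real encoder outputs:

import torch
import torch.nn.functional as F

def clip_loss(sensor_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of aligned sensor/text embeddings."""
    s = F.normalize(sensor_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    logits = s @ t.T / temperature              # (batch, batch) similarity matrix
    labels = torch.arange(len(s))               # i-th sensor matches i-th text
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

sensor_emb = torch.randn(16, 128)               # stand-ins for encoder outputs
text_emb = torch.randn(16, 128)
print(clip_loss(sensor_emb, text_emb))          # scalar training loss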
arXiv:2408.11821 (https://arxiv.org/abs/2408.11821) [pdf, other] cs.HC (Human-Computer Interaction); doi: 10.1016/j.iot.2024.101075
MIMA 2.0 -- Compact and Portable Multifunctional IoT integrated Menstrual Aid
Authors: Kumar J. Jyothish, Shreya Shivangi, Amish Bibhu, Subhankar Mishra, Sulagna Saha
Abstract: The shedding of the intrauterine lining, or endometrium, is known as menstruation. It occurs every month and causes several issues such as menstrual cramps and aches in the abdominal region, stains, menstrual malodor, rashes in intimate areas, and more. Our research found that almost none of the products available in the market address these problems single-handedly. Few remedies are available for the cramps, among which heat therapy is the most commonly used. Our methodology involved surveys on these problems and on the solutions deemed optimal. This inclusive approach helped us identify the gaps in available menstrual aids, which guided the development of MIMA (Multifunctional IoT Integrated Menstrual Aid). In this paper, we present IoT-integrated multifunctional smart intimate wear that aims to provide for women's multiple necessities during menstruation: a leakproof, antibacterial, anti-odor, rash-free experience along with an integrated Bluetooth-controlled intimate heat pad for relieving abdominal cramps. The entire process of product development was done in phases according to feedback from target users at each stage. This paper extends our earlier paper [1], which serves as the proof of concept for our approach. The development has led to MIMA 2.0, featuring a completely concealed and integrated design that includes a safe Bluetooth-controlled heating system for the intimate area. The product has received incredibly positive feedback from survey participants.
Submitted 3 August, 2024; originally announced August 2024.
Journal ref: In Internet of Things (Vol. 25, p. 101075). Elsevier BV (2024)
arXiv:2408.07393 (https://arxiv.org/abs/2408.07393) [pdf, other] cs.CV (Computer Vision and Pattern Recognition), eess.IV (Image and Video Processing)
Segment Using Just One Example
Authors: Pratik Vora, Sudipan Saha
Abstract: Semantic segmentation is an important topic in computer vision with many relevant applications in Earth observation. While supervised methods exist, the constraints of limited annotated data have encouraged the development of unsupervised approaches. However, existing unsupervised methods resemble clustering and cannot be directly mapped to explicit target classes. In this paper, we deal with single-shot semantic segmentation, where one example of the target class is provided and is used to segment the target class in query/test images. Our approach exploits the recently popular Segment Anything Model (SAM), a promptable foundation model. We specifically design several techniques to automatically generate prompts from the only example/key image in such a way that segmentation is successfully achieved on a stitch or concatenation of the example/key and query/test images. The proposed technique involves no training phase and requires just one example image to grasp the concept. Furthermore, no text-based prompt is required. We evaluated the proposed techniques on building and car classes.
Submitted 14 August, 2024; originally announced August 2024.
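A rough sketch of the stitching-and-prompting idea, using the segment-anything package; the checkpoint path is a placeholder, and the paper's actual prompt-generation techniques are more elaborate:

import numpy as np
from segment_anything import SamPredictor, sam_model_registry

def segment_from_one_example(example_img, example_mask, query_img, predictor):
    stitched = np.concatenate([example_img, query_img], axis=1)   # side-by-side stitch
    predictor.set_image(stitched)
    ys, xs = np.nonzero(example_mask)               # foreground pixels of the example
    idx = np.random.choice(len(xs), size=5)         # a few positive point prompts
    points = np.stack([xs[idx], ys[idx]], axis=1).astype(np.float32)
    masks, scores, _ = predictor.predict(point_coords=points,
                                         point_labels=np.ones(len(points), dtype=int))
    best = masks[np.argmax(scores)]                 # best mask over the stitched image
    return best[:, example_img.shape[1]:]           # keep only the query half

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")   # placeholder path
predictor = SamPredictor(sam)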
arXiv:2408.06644 (https://arxiv.org/abs/2408.06644) [pdf, other] eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition)
Specialized Change Detection using Segment Anything
Authors: Tahir Ahmad, Sudipan Saha
Abstract: Change detection (CD) is a fundamental task in Earth observation. While most change detection methods detect all changes, there is a growing need for specialized methods targeting specific changes relevant to particular applications while discarding the others. For instance, urban management might prioritize detecting the disappearance of buildings due to natural disasters or other reasons. Furthermore, while most supervised change detection methods require large-scale training datasets, in many applications only one or two training examples might be available instead. Addressing such needs, we propose a focused CD approach using the Segment Anything Model (SAM), a versatile vision foundation model. Our method leverages a binary mask of the object of interest in pre-change images to detect its disappearance in post-change images. By using SAM's robust segmentation capabilities, we create prompts from the pre-change mask, use those prompts to segment the post-change image, and identify missing objects. This unsupervised approach, demonstrated for building disappearance detection, is adaptable to various domains requiring specialized CD. Our contributions include defining a novel CD problem, proposing a method using SAM, and demonstrating its effectiveness. The proposed method also has benefits related to privacy preservation.
Submitted 13 August, 2024; originally announced August 2024.
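Continuing the SAM sketch above, the disappearance test can be as simple as an overlap check between the pre-change mask and whatever SAM recovers in the post-change image (the threshold is an assumption of ours):

import numpy as np

def disappeared(pre_mask, post_mask, iou_threshold=0.3):
    """pre_mask: object mask in the pre-change image; post_mask: SAM's mask from
    the post-change image, prompted from pre_mask (e.g. its centroid)."""
    inter = np.logical_and(pre_mask, post_mask).sum()
    union = np.logical_or(pre_mask, post_mask).sum()
    iou = inter / union if union else 0.0
    return iou < iou_threshold     # low overlap -> the object is likely gone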
arXiv:2408.05916 (https://arxiv.org/abs/2408.05916) [pdf, other] cs.LG (Machine Learning), eess.IV (Image and Video Processing)
Cluster-Segregate-Perturb (CSP): A Model-agnostic Explainability Pipeline for Spatiotemporal Land Surface Forecasting Models
Authors: Tushar Verma, Sudipan Saha
Abstract: Satellite images have become increasingly valuable for modelling regional climate change effects. Earth surface forecasting represents one such task that integrates satellite images with meteorological data to capture the joint evolution of regional climate change effects. However, understanding the complex relationship between specific meteorological variables and land surface evolution poses a significant challenge. In light of this challenge, our paper introduces a pipeline that integrates principles from both perturbation-based explainability techniques like LIME and global marginal explainability techniques like PDP, while addressing the constraints of applying such techniques to high-dimensional spatiotemporal deep models.
The proposed pipeline simplifies the undertaking of diverse investigative analyses, such as marginal sensitivity analysis, marginal correlation analysis, and lag analysis, on complex land surface forecasting models. In this study we utilised Convolutional Long Short-Term Memory (ConvLSTM) as the surface forecasting model and analysed the Normalized Difference Vegetation Index (NDVI) of the surface forecasts, since meteorological variables like temperature, pressure, and precipitation significantly influence it. The study area encompasses various regions in Europe. Our analyses show that precipitation exhibits the highest sensitivity in the study area, followed by temperature and pressure. Pressure has little to no direct effect on NDVI. Additionally, interesting nonlinear correlations between meteorological variables and NDVI have been uncovered.
Submitted 12 August, 2024; originally announced August 2024.
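A bare-bones version of a perturbation-based marginal sensitivity probe (the CSP pipeline itself also clusters and segregates samples first; the model, tensor shapes, and deltas here are our assumptions):

import numpy as np

def marginal_sensitivity(model, inputs, var_idx, deltas=(-0.1, 0.1)):
    """Perturb one meteorological channel and measure the mean change in the
    forecast NDVI. inputs: (batch, time, channels, H, W)."""
    base = model(inputs)                         # baseline NDVI forecasts
    effects = []
    for d in deltas:
        perturbed = inputs.copy()
        perturbed[:, :, var_idx] += d            # shift e.g. precipitation
        effects.append(np.abs(model(perturbed) - base).mean())
    return float(np.mean(effects))

toy = lambda x: x[:, :, 0].mean(axis=(1, 2, 3))  # toy model: NDVI depends on channel 0 only
x = np.random.rand(4, 8, 3, 16, 16)
print(marginal_sensitivity(toy, x, var_idx=0))   # high sensitivity
print(marginal_sensitivity(toy, x, var_idx=2))   # ~ 0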
arXiv:2407.15237 (https://arxiv.org/abs/2407.15237) [pdf, other] cs.CL (Computation and Language)
Two eyes, Two views, and finally, One summary! Towards Multi-modal Multi-tasking Knowledge-Infused Medical Dialogue Summarization
Authors: Anisha Saha, Abhisek Tiwari, Sai Ruthvik, Sriparna Saha
Abstract: We often summarize a multi-party conversation in two stages: chunking with homogeneous units and summarizing the chunks. Thus, we hypothesize that there exists a correlation between homogeneous speaker chunking and overall summarization tasks. In this work, we investigate the effectiveness of a multi-faceted approach that simultaneously produces summaries of medical concerns, doctor impressions, and an overall view. We introduce a multi-modal, multi-tasking, knowledge-infused medical dialogue summary generation (MMK-Summation) model, which incorporates adapter-based fine-tuning through a gated mechanism for multi-modal information integration. The model takes dialogues as input, extracts pertinent external knowledge based on the context, integrates the knowledge and visual cues from the dialogues into the textual content, and ultimately generates concise summaries encompassing medical concerns, doctor impressions, and a comprehensive overview. The introduced model surpasses multiple baselines and traditional summarization models across all evaluation metrics (including human evaluation), which firmly demonstrates the efficacy of knowledge-guided, multi-tasking, multimodal medical conversation summarization. The code is available at https://github.com/NLP-RL/MMK-Summation.
Submitted 21 July, 2024; originally announced July 2024.
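The gated multi-modal integration the abstract mentions can be sketched in PyTorch as follows; module names and dimensions are our assumptions, not the released MMK-Summation code:

import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    def __init__(self, d_text=768, d_vis=512, d_out=768):
        super().__init__()
        self.proj_vis = nn.Linear(d_vis, d_out)
        self.gate = nn.Linear(d_text + d_out, d_out)

    def forward(self, text_feats, vis_feats):
        v = self.proj_vis(vis_feats)                       # align dimensions
        g = torch.sigmoid(self.gate(torch.cat([text_feats, v], dim=-1)))
        return text_feats + g * v                          # gate how much vision enters

fused = GatedFusion()(torch.randn(2, 10, 768), torch.randn(2, 10, 512))
print(fused.shape)    # torch.Size([2, 10, 768])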
arXiv:2407.14414 (https://arxiv.org/abs/2407.14414) [pdf, other] cs.AI (Artificial Intelligence), cs.CL (Computation and Language), cs.LG (Machine Learning)
System-1.x: Learning to Balance Fast and Slow Planning with Language Models
Authors: Swarnadeep Saha, Archiki Prasad, Justin Chih-Yao Chen, Peter Hase, Elias Stengel-Eskin, Mohit Bansal
Abstract: Language models can be used to solve long-horizon planning problems in two distinct modes: a fast 'System-1' mode, directly generating plans without any explicit search or backtracking, and a slow 'System-2' mode, planning step-by-step by explicitly searching over possible actions. While System-2 is typically more effective, it is also more computationally expensive, making it infeasible for long plans or large action spaces. Moreover, isolated System-1 or 2 ignores the user's end goals, failing to provide ways to control the model's behavior. To this end, we propose the System-1.x Planner, a controllable planning framework with LLMs that is capable of generating hybrid plans and balancing between the two planning modes based on the difficulty of the problem at hand. System-1.x consists of (i) a controller, (ii) a System-1 Planner, and (iii) a System-2 Planner. Based on a user-specified hybridization factor (x) governing the mixture between System-1 and 2, the controller decomposes a problem into sub-goals, and classifies them as easy or hard to be solved by either System-1 or 2, respectively. We fine-tune all three components on top of a single base LLM, requiring only search traces as supervision. Experiments with two diverse planning tasks -- Maze Navigation and Blocksworld -- show that our System-1.x Planner outperforms a System-1 Planner, a System-2 Planner trained to approximate A* search, and also a symbolic planner (A*). We demonstrate the following key properties of our planner: (1) controllability: increasing the hybridization factor (e.g., System-1.75 vs 1.5) performs more search, improving performance, (2) flexibility: by building a neuro-symbolic variant with a neural System-1 and a symbolic System-2, we can use existing symbolic methods, and (3) generalizability: by being able to learn from different search algorithms, our method is robust to the choice of search algorithm.
Submitted 19 July, 2024; originally announced July 2024.
Comments: 29 pages (10 tables)
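The controller-dispatch idea reduces to a few lines of schematic Python (in the paper the controller and both planners are fine-tuned LLMs; here they are stubbed as callables):

def system_1x_plan(problem, controller, system1, system2):
    """controller(problem) -> list of (sub_goal, is_hard) pairs, with the
    easy/hard split governed by the hybridization factor x."""
    plan = []
    for sub_goal, is_hard in controller(problem):
        step = system2(sub_goal) if is_hard else system1(sub_goal)
        plan.extend(step)        # concatenate sub-plans into one hybrid plan
    return plan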
arXiv:2407.05800 (https://arxiv.org/abs/2407.05800) [pdf, other] cs.LG (Machine Learning), cs.AI (Artificial Intelligence), cs.CV (Computer Vision and Pattern Recognition), cs.DC (Distributed, Parallel, and Cluster Computing)
FedMRL: Data Heterogeneity Aware Federated Multi-agent Deep Reinforcement Learning for Medical Imaging
Authors: Pranab Sahoo, Ashutosh Tripathi, Sriparna Saha, Samrat Mondal
Abstract: Despite recent advancements in federated learning (FL) for medical image diagnosis, addressing data heterogeneity among clients remains a significant challenge for practical implementation. A primary hurdle in FL arises from the non-IID nature of data samples across clients, which typically results in a decline in the performance of the aggregated global model. In this study, we introduce FedMRL, a novel federated multi-agent deep reinforcement learning framework designed to address data heterogeneity. FedMRL incorporates a novel loss function to facilitate fairness among clients, preventing bias in the final global model. Additionally, it employs a multi-agent reinforcement learning (MARL) approach to calculate the proximal term $(\mu)$ for the personalized local objective function, ensuring convergence to the global optimum. Furthermore, FedMRL integrates an adaptive weight adjustment method using a self-organizing map (SOM) on the server side to counteract distribution shifts among clients' local data distributions.
We assess our approach using two publicly available real-world medical datasets, and the results demonstrate that FedMRL significantly outperforms state-of-the-art techniques, showing its efficacy in addressing data heterogeneity in federated learning. The code is available at https://github.com/Pranabiitp/FedMRL.
Submitted 8 July, 2024; originally announced July 2024.
Comments: Accepted to MICCAI 2024
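For flavor, a FedProx-style local objective with proximal term $\mu$; in FedMRL the $\mu$ is produced per client by the MARL agents, whereas here it is a fixed constant:

import torch

def local_loss(model, global_params, task_loss, mu=0.1):
    """task_loss: loss on the client's local batch; adds (mu/2)*||w - w_global||^2
    so the personalized local objective stays close to the global model."""
    prox = sum(((p - g.detach()) ** 2).sum()
               for p, g in zip(model.parameters(), global_params))
    return task_loss + 0.5 * mu * prox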
arXiv:2407.02362 (https://arxiv.org/abs/2407.02362) [pdf, other] cs.AR (Hardware Architecture), cs.AI (Artificial Intelligence), cs.LG (Machine Learning)
Fast, Scalable, Energy-Efficient Non-element-wise Matrix Multiplication on FPGA
Authors: Xuqi Zhu, Huaizhi Zhang, JunKyu Lee, Jiacheng Zhu, Chandrajit Pal, Sangeet Saha, Klaus D. McDonald-Maier, Xiaojun Zhai
Abstract: Modern Neural Network (NN) architectures heavily rely on vast numbers of multiply-accumulate arithmetic operations, constituting the predominant computational cost. Therefore, this paper proposes a high-throughput, scalable and energy-efficient non-element-wise matrix multiplication unit on FPGAs as a basic component of the NNs. We first streamline inter-layer and intra-layer redundancies of the MADDNESS algorithm, a LUT-based approximate matrix multiplication, to design a fast, efficient, scalable approximate matrix multiplication module termed "Approximate Multiplication Unit (AMU)". The AMU further optimizes LUT-based matrix multiplications through dedicated memory management and access design, decoupling computational overhead from input resolution and significantly boosting the efficiency of FPGA-based NN accelerators. The experimental results show that using our AMU achieves up to 9x higher throughput and 112x higher energy efficiency over the state-of-the-art solutions for FPGA-based Quantised Neural Network (QNN) accelerators.
Submitted 7 July, 2024; v1 submitted 2 July, 2024; originally announced July 2024.
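The core LUT trick can be demonstrated in NumPy: pre-compute partial dot products between prototypes and the weight matrix, then replace multiplications at inference time with table lookups. For brevity, random prototypes and a product-quantization-style nearest-prototype encoding stand in for MADDNESS's learned hash functions:

import numpy as np

def build_luts(B, prototypes):
    # prototypes: (n_subspaces, K, sub_dim); B: (d, n_out), rows split by subspace
    n_sub, K, sub = prototypes.shape
    Bs = B.reshape(n_sub, sub, -1)
    return np.einsum("skd,sdn->skn", prototypes, Bs)   # (n_sub, K, n_out) partial dots

def approx_matmul(A, prototypes, luts):
    n_sub, K, sub = prototypes.shape
    As = A.reshape(A.shape[0], n_sub, sub)
    # encode: nearest prototype per subspace (argmin of squared distance)
    codes = np.argmin(((As[:, :, None, :] - prototypes[None]) ** 2).sum(-1), axis=2)
    # decode: sum the pre-computed partial products selected by the codes
    return sum(luts[s, codes[:, s]] for s in range(n_sub))

rng = np.random.default_rng(0)
A, B = rng.normal(size=(32, 16)), rng.normal(size=(16, 8))
prototypes = rng.normal(size=(4, 16, 4))               # 4 subspaces, 16 codes each
out = approx_matmul(A, prototypes, build_luts(B, prototypes))
print(out.shape, np.abs(out - A @ B).mean())           # shape and approximation error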
arXiv:2407.00480 (https://arxiv.org/abs/2407.00480) [pdf] cs.CV (Computer Vision and Pattern Recognition)
Development of an interactive GUI using MATLAB for the detection of type and stage of Breast Tumor
Authors: Poulmi Banerjee, Satadal Saha
Abstract: Breast cancer is described as one of the most common types of cancer, diagnosed mainly in women. Comparing males to females, it has been found that females are more prone to breast cancer than males. Breast lumps are classified into two main groups: cancerous and non-cancerous. When the lump in the breast is cancerous, it can spread via the lobules, ducts, areola, and stroma to various organs of the body. Non-cancerous breast lumps are less harmful, but they should be monitored under proper diagnosis to avoid their transformation into cancerous lumps. To diagnose these breast lumps, mammograms, ultrasound images, and MRI images are used. For better diagnosis doctors sometimes also recommend a biopsy, and any unforeseen anomalies occurring there may give rise to inaccurate test reports. To avoid these discrepancies, processing the mammogram images is considered one of the most reliable methods. In the proposed method a MATLAB GUI is developed, with sample images of breast lumps placed in the respective axes. With the help of sliders the actual breast lump image is compared with the stored breast lump sample images, and the history of the breast lump is then generated in real time in the form of a test report.
Submitted 29 June, 2024; originally announced July 2024.
arXiv:2406.19678 (https://arxiv.org/abs/2406.19678) [pdf, other] cs.RO (Robotics); doi: 10.31256/HSMR2024.60
UltraGelBot: Autonomous Gel Dispenser for Robotic Ultrasound
Authors: Deepak Raina, Ziming Zhao, Richard Voyles, Juan Wachs, Subir K. Saha, S. H. Chandrashekhara
Abstract: Telerobotic and Autonomous Robotic Ultrasound Systems (RUS) help alleviate operator dependence in free-hand ultrasound examinations. However, state-of-the-art RUSs still rely on a human operator to apply the ultrasound gel. The lack of standardization in this process often leads to poor imaging of the scanned region, owing to air gaps between the probe and the human body. In this paper, we developed an end-of-arm tool for RUS, referred to as UltraGelBot, which can autonomously detect and dispense the gel. It uses a deep learning model to detect the gel in images acquired from an on-board camera, and a motorized mechanism uses this feedback to dispense the gel. Experiments on a phantom revealed that UltraGelBot increases the acquired image quality by 18.6% and reduces the procedure time by 37.2%.
Submitted 28 June, 2024; originally announced June 2024.
Comments: 2024 16th Hamlyn Symposium on Medical Robotics (HSMR)
arXiv:2406.13272 (https://arxiv.org/abs/2406.13272) [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
AniFaceDiff: High-Fidelity Face Reenactment via Facial Parametric Conditioned Diffusion Models
Authors: Ken Chen, Sachith Seneviratne, Wei Wang, Dongting Hu, Sanjay Saha, Md. Tarek Hasan, Sanka Rasnayaka, Tamasha Malepathirana, Mingming Gong, Saman Halgamuge
Abstract: Face reenactment refers to the process of transferring the pose and facial expressions from a reference (driving) video onto a static facial (source) image while maintaining the original identity of the source image. Previous research in this domain has made significant progress by training controllable deep generative models to generate faces based on specific identity, pose and expression conditions. However, the mechanisms used in these methods to control pose and expression often inadvertently introduce identity information from the driving video, while also causing a loss of expression-related details. This paper proposes a new method based on Stable Diffusion, called AniFaceDiff, incorporating a new conditioning module for high-fidelity face reenactment. First, we propose an enhanced 2D facial snapshot conditioning approach by facial shape alignment to prevent the inclusion of identity information from the driving video. Then, we introduce an expression adapter conditioning mechanism to address the potential loss of expression-related information. Our approach effectively preserves pose and expression fidelity from the driving video while retaining the identity and fine details of the source image. Through experiments on the VoxCeleb dataset, we demonstrate that our method achieves state-of-the-art results in face reenactment, showcasing superior image quality, identity preservation, and expression accuracy, especially for cross-identity scenarios. Considering the ethical concerns surrounding potential misuse, we analyze the implications of our method, evaluate current state-of-the-art deepfake detectors, and identify their shortcomings to guide future research.
Submitted 19 June, 2024; originally announced June 2024.
arXiv:2406.12931 (https://arxiv.org/abs/2406.12931) [pdf, other] eess.AS (Audio and Speech Processing), cs.CL (Computation and Language), cs.SD (Sound)
Automatic Speech Recognition for Biomedical Data in Bengali Language
Authors: Shariar Kabir, Nazmun Nahar, Shyamasree Saha, Mamunur Rashid
Abstract: This paper presents the development of a prototype Automatic Speech Recognition (ASR) system specifically designed for Bengali biomedical data. Recent advancements in Bengali ASR are encouraging, but a lack of domain-specific data limits the creation of practical healthcare ASR models. This project bridges this gap by developing an ASR system tailored to Bengali medical terms like symptoms, severity levels, and diseases, encompassing two major dialects: Bengali and Sylheti. We train and evaluate two popular ASR frameworks on a comprehensive 46-hour Bengali medical corpus. Our core objective is to create deployable health-domain ASR systems for digital health applications, ultimately increasing accessibility for non-technical users in the healthcare sector.
Submitted 16 June, 2024; originally announced June 2024.
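Evaluation of such ASR systems typically reports word error rate (WER); a self-contained reference implementation, shown here with English words purely for readability:

def wer(reference, hypothesis):
    r, h = reference.split(), hypothesis.split()
    # dynamic-programming edit distance over words
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + (r[i - 1] != h[j - 1]))  # substitution
    return d[len(r)][len(h)] / len(r)

print(wer("patient reports severe chest pain", "patient reports chest pain"))  # 0.2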
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09043">arXiv:2406.09043</a> <span> [<a href="https://arxiv.org/pdf/2406.09043">pdf</a>, <a href="https://arxiv.org/format/2406.09043">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Language Models are Crossword Solvers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+S">Soumadeep Saha</a>, <a href="/search/cs?searchtype=author&query=Chakraborty%2C+S">Sutanoya Chakraborty</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Saptarshi Saha</a>, <a href="/search/cs?searchtype=author&query=Garain%2C+U">Utpal Garain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09043v2-abstract-short" style="display: inline;"> Crosswords are a form of word puzzle that require a solver to demonstrate a high degree of proficiency in natural language understanding, wordplay, reasoning, and world knowledge, along with adherence to character and length constraints. In this paper we tackle the challenge of solving crosswords with Large Language Models (LLMs). We demonstrate that the current generation of state-of-the-art (SoT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09043v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09043v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09043v2-abstract-full" style="display: none;"> Crosswords are a form of word puzzle that require a solver to demonstrate a high degree of proficiency in natural language understanding, wordplay, reasoning, and world knowledge, along with adherence to character and length constraints. In this paper we tackle the challenge of solving crosswords with Large Language Models (LLMs). We demonstrate that the current generation of state-of-the-art (SoTA) language models show significant competence at deciphering cryptic crossword clues, and outperform previously reported SoTA results by a factor of 2-3 in relevant benchmarks. We also develop a search algorithm that builds off this performance to tackle the problem of solving full crossword grids with LLMs for the very first time, achieving an accuracy of 93\% on New York Times crossword puzzles. Contrary to previous work in this area which concluded that LLMs lag human expert performance significantly, our research suggests this gap is a lot narrower. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09043v2-abstract-full').style.display = 'none'; document.getElementById('2406.09043v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
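<p class="is-size-7">The grid-solving search itself is not detailed in this listing; the sketch below only illustrates the hard constraints such a solver must enforce on LLM proposals, namely answer length and letters fixed by crossing entries. Here <code>propose_candidates</code> is a hypothetical stand-in for an LLM call, not the paper's interface.</p> <pre><code class="language-python">
# Hedged sketch: filter hypothetical LLM candidate answers by crossword
# constraints (length and crossing letters). Not the paper's implementation.

def fits(candidate: str, length: int, crossings: dict) -> bool:
    """crossings maps 0-based positions to letters already fixed there."""
    return (len(candidate) == length
            and all(candidate[i].upper() == ch.upper()
                    for i, ch in crossings.items()))

def solve_slot(clue, length, crossings, propose_candidates):
    # Keep the first proposal that satisfies every hard constraint.
    for cand in propose_candidates(clue, length):
        cand = cand.replace(" ", "")
        if fits(cand, length, crossings):
            return cand
    return None

fake_llm = lambda clue, n: ["PARIS", "LONDON", "PRAGUE"]  # canned "LLM"
print(solve_slot("Capital on the Seine (5)", 5, {0: "P", 4: "S"}, fake_llm))
</code></pre>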
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Edited to include missing citation</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05598">arXiv:2406.05598</a> <span> [<a href="https://arxiv.org/pdf/2406.05598">pdf</a>, <a href="https://arxiv.org/format/2406.05598">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Understanding Inhibition Through Maximally Tense Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hamblin%2C+C">Chris Hamblin</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Srijani Saha</a>, <a href="/search/cs?searchtype=author&query=Konkle%2C+T">Talia Konkle</a>, <a href="/search/cs?searchtype=author&query=Alvarez%2C+G">George Alvarez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05598v1-abstract-short" style="display: inline;"> We address the functional role of 'feature inhibition' in vision models; that is, what are the mechanisms by which a neural network ensures images do not express a given feature? We observe that standard interpretability tools in the literature are not immediately suited to the inhibitory case, given the asymmetry introduced by the ReLU activation function. Given this, we propose inhibition be und… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05598v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05598v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05598v1-abstract-full" style="display: none;"> We address the functional role of 'feature inhibition' in vision models; that is, what are the mechanisms by which a neural network ensures images do not express a given feature? We observe that standard interpretability tools in the literature are not immediately suited to the inhibitory case, given the asymmetry introduced by the ReLU activation function. Given this, we propose inhibition be understood through a study of 'maximally tense images' (MTIs), i.e. those images that excite and inhibit a given feature simultaneously. We show how MTIs can be studied with two novel visualization techniques; +/- attribution inversions, which split single images into excitatory and inhibitory components, and the attribution atlas, which provides a global visualization of the various ways images can excite/inhibit a feature. Finally, we explore the difficulties introduced by superposition, as such interfering features induce the same attribution motif as MTIs. 
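<p class="is-size-7">To make the excitation/inhibition asymmetry concrete, here is a hedged NumPy sketch (the paper's +/- attribution inversions are not shown in this listing) that splits one linear unit's pre-ReLU input into excitatory and inhibitory contributions; an input is "tense" for the unit when both parts are simultaneously large.</p> <pre><code class="language-python">
import numpy as np

# Sketch, not the paper's code: split a unit's pre-ReLU input z = w @ x + b
# into excitatory and inhibitory parts.
rng = np.random.default_rng(0)
w, b = rng.normal(size=128), 0.1
x = rng.normal(size=128)              # stand-in for upstream activations

contrib = w * x                       # per-input contribution to the unit
excite = contrib[contrib > 0].sum()   # pushes the pre-activation up
inhibit = contrib[contrib < 0].sum()  # pushes it down (a negative number)
z = excite + inhibit + b              # equals w @ x + b

tension = min(excite, -inhibit)       # large when both parts are large
print(f"z={z:.3f}  excite={excite:.3f}  inhibit={inhibit:.3f}  tension={tension:.3f}")
</code></pre>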
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05598v1-abstract-full').style.display = 'none'; document.getElementById('2406.05598v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05344">arXiv:2406.05344</a> <span> [<a href="https://arxiv.org/pdf/2406.05344">pdf</a>, <a href="https://arxiv.org/format/2406.05344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MemeGuard: An LLM and VLM-based Framework for Advancing Content Moderation via Meme Intervention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jha%2C+P">Prince Jha</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+R">Raghav Jain</a>, <a href="/search/cs?searchtype=author&query=Mandal%2C+K">Konika Mandal</a>, <a href="/search/cs?searchtype=author&query=Chadha%2C+A">Aman Chadha</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a>, <a href="/search/cs?searchtype=author&query=Bhattacharyya%2C+P">Pushpak Bhattacharyya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05344v1-abstract-short" style="display: inline;"> In the digital world, memes present a unique challenge for content moderation due to their potential to spread harmful content. Although detection methods have improved, proactive solutions such as intervention are still limited, with current research focusing mostly on text-based content, neglecting the widespread influence of multimodal content like memes. Addressing this gap, we present \textit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05344v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05344v1-abstract-full" style="display: none;"> In the digital world, memes present a unique challenge for content moderation due to their potential to spread harmful content. Although detection methods have improved, proactive solutions such as intervention are still limited, with current research focusing mostly on text-based content, neglecting the widespread influence of multimodal content like memes. Addressing this gap, we present \textit{MemeGuard}, a comprehensive framework leveraging Large Language Models (LLMs) and Visual Language Models (VLMs) for meme intervention. \textit{MemeGuard} harnesses a specially fine-tuned VLM, \textit{VLMeme}, for meme interpretation, and a multimodal knowledge selection and ranking mechanism (\textit{MKS}) for distilling relevant knowledge. This knowledge is then employed by a general-purpose LLM to generate contextually appropriate interventions. 
Another key contribution of this work is the \textit{\textbf{I}ntervening} \textit{\textbf{C}yberbullying in \textbf{M}ultimodal \textbf{M}emes (ICMM)} dataset, a high-quality, labeled dataset featuring toxic memes and their corresponding human-annotated interventions. We leverage \textit{ICMM} to test \textit{MemeGuard}, demonstrating its proficiency in generating relevant and effective responses to toxic memes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05344v1-abstract-full').style.display = 'none'; document.getElementById('2406.05344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.03556">arXiv:2406.03556</a> <span> [<a href="https://arxiv.org/pdf/2406.03556">pdf</a>, <a href="https://arxiv.org/format/2406.03556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ACCESS.2024.3424662">10.1109/ACCESS.2024.3424662 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Npix2Cpix: A GAN-Based Image-to-Image Translation Network With Retrieval-Classification Integration for Watermark Retrieval From Historical Document Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+U">Utsab Saha</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sawradip Saha</a>, <a href="/search/cs?searchtype=author&query=Fattah%2C+S+A">Shaikh Anowarul Fattah</a>, <a href="/search/cs?searchtype=author&query=Saquib%2C+M">Mohammad Saquib</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.03556v3-abstract-short" style="display: inline;"> The identification and restoration of ancient watermarks have long been a major topic in codicology and history. Classifying historical documents based on watermarks is challenging due to their diversity, noisy samples, multiple representation modes, and minor distinctions between classes and intra-class variations. This paper proposes a modified U-net-based conditional generative adversarial netw… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03556v3-abstract-full').style.display = 'inline'; document.getElementById('2406.03556v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.03556v3-abstract-full" style="display: none;"> The identification and restoration of ancient watermarks have long been a major topic in codicology and history. Classifying historical documents based on watermarks is challenging due to their diversity, noisy samples, multiple representation modes, and minor distinctions between classes and intra-class variations. 
This paper proposes a modified U-net-based conditional generative adversarial network (GAN) named Npix2Cpix to translate noisy raw historical watermarked images into clean, handwriting-free watermarked images by performing image translation from degraded (noisy) pixels to clean pixels. Using image-to-image translation and adversarial learning, the network creates clutter-free images for watermark restoration and categorization. The generator and discriminator of the proposed GAN are trained using two separate loss functions, each based on the distance between images, to learn the mapping from the input noisy image to the output clean image. After using the proposed GAN to pre-process noisy watermarked images, Siamese-based one-shot learning is employed for watermark classification. Experimental results on a large-scale historical watermark dataset demonstrate that cleaning the noisy watermarked images can help to achieve high one-shot classification accuracy. The qualitative and quantitative evaluation of the retrieved watermarked image highlights the effectiveness of the proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.03556v3-abstract-full').style.display = 'none'; document.getElementById('2406.03556v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Access 12 (2024) 95857-95870 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02450">arXiv:2406.02450</a> <span> [<a href="https://arxiv.org/pdf/2406.02450">pdf</a>, <a href="https://arxiv.org/format/2406.02450">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Generalized Apprenticeship Learning Framework for Modeling Heterogeneous Student Pedagogical Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Islam%2C+M+M">Md Mirajul Islam</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Hostetter%2C+J">John Hostetter</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+A+S">Adittya Soukarjya Saha</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+M">Min Chi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02450v1-abstract-short" style="display: inline;"> A key challenge in e-learning environments like Intelligent Tutoring Systems (ITSs) is to induce effective pedagogical policies efficiently. While Deep Reinforcement Learning (DRL) often suffers from sample inefficiency and reward function design difficulty, Apprenticeship Learning (AL) algorithms can overcome them. 
However, most AL algorithms cannot handle heterogeneity as they assume all demonst… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02450v1-abstract-full').style.display = 'inline'; document.getElementById('2406.02450v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02450v1-abstract-full" style="display: none;"> A key challenge in e-learning environments like Intelligent Tutoring Systems (ITSs) is to induce effective pedagogical policies efficiently. While Deep Reinforcement Learning (DRL) often suffers from sample inefficiency and reward function design difficulty, Apprenticeship Learning (AL) algorithms can overcome them. However, most AL algorithms cannot handle heterogeneity as they assume all demonstrations are generated with a homogeneous policy driven by a single reward function. Still, the AL algorithms that do consider heterogeneity often cannot generalize to large continuous state spaces and work only with discrete states. In this paper, we propose EM-EDM, an expectation-maximization (EM) based general AL framework to induce effective pedagogical policies from given optimal or near-optimal demonstrations, which are assumed to be driven by heterogeneous reward functions. We compare the effectiveness of the policies induced by our proposed EM-EDM against four AL-based baselines and two policies induced by DRL on two different but related tasks that involve pedagogical action prediction. Our overall results showed that, for both tasks, EM-EDM outperforms the four AL baselines across all performance metrics and the two DRL baselines. This suggests that EM-EDM can effectively model complex student pedagogical decision-making processes through the ability to manage a large, continuous state space and adapt to handle diverse and heterogeneous reward functions with very few given demonstrations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02450v1-abstract-full').style.display = 'none'; document.getElementById('2406.02450v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20628">arXiv:2405.20628</a> <span> [<a href="https://arxiv.org/pdf/2405.20628">pdf</a>, <a href="https://arxiv.org/format/2405.20628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ToxVidLM: A Multimodal Framework for Toxicity Detection in Code-Mixed Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Maity%2C+K">Krishanu Maity</a>, <a href="/search/cs?searchtype=author&query=Poornash%2C+A+S">A. S. 
Poornash</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a>, <a href="/search/cs?searchtype=author&query=Bhattacharyya%2C+P">Pushpak Bhattacharyya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20628v2-abstract-short" style="display: inline;"> In an era of rapidly evolving internet technology, the surge in multimodal content, including videos, has expanded the horizons of online communication. However, the detection of toxic content in this diverse landscape, particularly in low-resource code-mixed languages, remains a critical challenge. While substantial research has addressed toxic content detection in textual data, the realm of vide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20628v2-abstract-full').style.display = 'inline'; document.getElementById('2405.20628v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20628v2-abstract-full" style="display: none;"> In an era of rapidly evolving internet technology, the surge in multimodal content, including videos, has expanded the horizons of online communication. However, the detection of toxic content in this diverse landscape, particularly in low-resource code-mixed languages, remains a critical challenge. While substantial research has addressed toxic content detection in textual data, the realm of video content, especially in non-English languages, has been relatively underexplored. This paper addresses this research gap by introducing a benchmark dataset, the first of its kind, consisting of 931 videos with 4021 code-mixed Hindi-English utterances collected from YouTube. Each utterance within this dataset has been meticulously annotated for toxicity, severity, and sentiment labels. We have developed an advanced Multimodal Multitask framework built for Toxicity detection in Video Content by leveraging Language Models (LMs), crafted for the primary objective along with the additional tasks of conducting sentiment and severity analysis. ToxVidLM incorporates three key modules - the Encoder module, Cross-Modal Synchronization module, and Multitask module - crafting a generic multimodal LM customized for intricate video classification tasks. Our experiments reveal that incorporating multiple modalities from the videos substantially enhances the performance of toxic content detection by achieving an Accuracy and Weighted F1 score of 94.29% and 94.35%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20628v2-abstract-full').style.display = 'none'; document.getElementById('2405.20628v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a Long Paper in ACL Findings 2024. 
For acceptance details, see https://2024.aclweb.org/program/finding_papers/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18506">arXiv:2405.18506</a> <span> [<a href="https://arxiv.org/pdf/2405.18506">pdf</a>, <a href="https://arxiv.org/format/2405.18506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> </div> </div> <p class="title is-5 mathjax"> An Algorithm for the Decomposition of Complete Graph into Minimum Number of Edge-disjoint Trees </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sinha%2C+A">Antika Sinha</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S+K">Sanjoy Kumar Saha</a>, <a href="/search/cs?searchtype=author&query=Basuchowdhuri%2C+P">Partha Basuchowdhuri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18506v1-abstract-short" style="display: inline;"> In this work, we study methodical decomposition of an undirected, unweighted complete graph ($K_n$ of order $n$, size $m$) into minimum number of edge-disjoint trees. We find that $x$, a positive integer, is minimum and $x=\lceil\frac{n}{2}\rceil$ as the edge set of $K_n$ is decomposed into edge-disjoint trees of size sequence $M = \{m_1,m_2,...,m_x\}$ where $m_i\le(n-1)$ and $\sum_{i=1}^{x} m_i$ =… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18506v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18506v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18506v1-abstract-full" style="display: none;"> In this work, we study methodical decomposition of an undirected, unweighted complete graph ($K_n$ of order $n$, size $m$) into minimum number of edge-disjoint trees. We find that $x$, a positive integer, is minimum and $x=\lceil\frac{n}{2}\rceil$ as the edge set of $K_n$ is decomposed into edge-disjoint trees of size sequence $M = \{m_1,m_2,...,m_x\}$ where $m_i\le(n-1)$ and $\sum_{i=1}^{x} m_i = \frac{n(n-1)}{2}$. For decomposing the edge set of $K_n$ into minimum number of edge-disjoint trees, our proposed algorithm takes total $O(m)$ time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18506v1-abstract-full').style.display = 'none'; document.getElementById('2405.18506v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
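<p class="is-size-7">For even $n$, the $x=\lceil\frac{n}{2}\rceil$ bound is attained by $n/2$ edge-disjoint Hamiltonian paths, each a tree with $n-1$ edges. The sketch below uses the classical zigzag construction and verifies the decomposition for $K_8$; it covers the even case only and is not necessarily the paper's $O(m)$ algorithm.</p> <pre><code class="language-python">
from itertools import combinations

def zigzag_paths(n: int):
    """Decompose K_n (n even) into n//2 edge-disjoint Hamiltonian paths via
    the classical zigzag construction (illustration only, not necessarily
    the paper's algorithm). Each path is a tree with n-1 edges."""
    assert n % 2 == 0 and n >= 2
    paths = []
    for j in range(n // 2):
        seq, cur, step = [j], j, 1
        for _ in range(n - 1):
            cur = (cur + step) % n
            seq.append(cur)
            # steps between consecutive vertices: +1, -2, +3, -4, ...
            # (offsets from the start vertex: 0, +1, -1, +2, -2, ...)
            step = -(step + 1) if step > 0 else -(step - 1)
        paths.append(seq)
    return paths

def edge_set(seq):
    return {frozenset(e) for e in zip(seq, seq[1:])}

n = 8
paths = zigzag_paths(n)
union = set().union(*map(edge_set, paths))
assert sum(len(edge_set(p)) for p in paths) == len(union)          # disjoint
assert union == {frozenset(c) for c in combinations(range(n), 2)}  # covers K_n
print(f"K_{n}: {len(paths)} edge-disjoint trees, {n - 1} edges each")
</code></pre>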
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures and 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15766">arXiv:2405.15766</a> <span> [<a href="https://arxiv.org/pdf/2405.15766">pdf</a>, <a href="https://arxiv.org/format/2405.15766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.18653/v1/2024.findings-acl.667">10.18653/v1/2024.findings-acl.667 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Adverse Drug Event Detection with Multimodal Dataset: Corpus Creation and Model Development </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sahoo%2C+P">Pranab Sahoo</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+A+K">Ayush Kumar Singh</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a>, <a href="/search/cs?searchtype=author&query=Chadha%2C+A">Aman Chadha</a>, <a href="/search/cs?searchtype=author&query=Mondal%2C+S">Samrat Mondal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15766v2-abstract-short" style="display: inline;"> The mining of adverse drug events (ADEs) is pivotal in pharmacovigilance, enhancing patient safety by identifying potential risks associated with medications, facilitating early detection of adverse events, and guiding regulatory decision-making. Traditional ADE detection methods are reliable but slow, not easily adaptable to large-scale operations, and offer limited information. With the exponent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15766v2-abstract-full').style.display = 'inline'; document.getElementById('2405.15766v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15766v2-abstract-full" style="display: none;"> The mining of adverse drug events (ADEs) is pivotal in pharmacovigilance, enhancing patient safety by identifying potential risks associated with medications, facilitating early detection of adverse events, and guiding regulatory decision-making. Traditional ADE detection methods are reliable but slow, not easily adaptable to large-scale operations, and offer limited information. With the exponential increase in data sources like social media content, biomedical literature, and Electronic Medical Records (EMR), extracting relevant ADE-related information from these unstructured texts is imperative. 
Previous ADE mining studies have focused on text-based methodologies, overlooking visual cues, limiting contextual comprehension, and hindering accurate interpretation. To address this gap, we present a MultiModal Adverse Drug Event (MMADE) detection dataset, merging ADE-related textual information with visual aids. Additionally, we introduce a framework that leverages the capabilities of LLMs and VLMs for ADE detection by generating detailed descriptions of medical images depicting ADEs, aiding healthcare professionals in visually identifying adverse events. Using our MMADE dataset, we showcase the significance of integrating visual cues from images to enhance overall performance. This approach holds promise for patient safety, ADE awareness, and healthcare accessibility, paving the way for further exploration in personalized healthcare. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15766v2-abstract-full').style.display = 'none'; document.getElementById('2405.15766v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL Findings 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> 2024.findings-acl.667 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11573">arXiv:2405.11573</a> <span> [<a href="https://arxiv.org/pdf/2405.11573">pdf</a>, <a href="https://arxiv.org/format/2405.11573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Quantile Activation: departing from single point estimation for better generalization across distortions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Challa%2C+A">Aditya Challa</a>, <a href="/search/cs?searchtype=author&query=Danda%2C+S">Sravan Danda</a>, <a href="/search/cs?searchtype=author&query=Najman%2C+L">Laurent Najman</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Snehanshu Saha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11573v1-abstract-short" style="display: inline;"> A classifier is, in its essence, a function which takes an input and returns the class of the input and implicitly assumes an underlying distribution. We argue in this article that one has to move away from this basic tenet to obtain generalisation across distributions. 
Specifically, the class of the sample should depend on the points from its context distribution for better generalisation across… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11573v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11573v1-abstract-full" style="display: none;"> A classifier is, in its essence, a function which takes an input and returns the class of the input and implicitly assumes an underlying distribution. We argue in this article that one has to move away from this basic tenet to obtain generalisation across distributions. Specifically, the class of the sample should depend on the points from its context distribution for better generalisation across distributions. How does one achieve this? The key idea is to adapt the outputs of each neuron of the network to its context distribution. We propose quantile activation, QACT, which, in simple terms, outputs the relative quantile of the sample in its context distribution, instead of the actual values in traditional networks. The scope of this article is to validate the proposed activation across several experimental settings, and compare it with conventional techniques. For this, we use the datasets developed to test robustness against distortions (CIFAR10C, CIFAR100C, MNISTC, TinyImagenetC), and show that we achieve a significantly higher generalisation across distortions than the conventional classifiers, across different architectures. Although this paper is only a proof of concept, we surprisingly find that this approach outperforms DINOv2 (small) at large distortions, even though DINOv2 is trained with a far bigger network on a considerably larger dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11573v1-abstract-full').style.display = 'none'; document.getElementById('2405.11573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
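<p class="is-size-7">A minimal sketch of the stated idea, assuming the mini-batch plays the role of the context distribution and that outputs are centered empirical quantiles; both are assumptions, since the paper's exact formulation is not shown in this listing.</p> <pre><code class="language-python">
import numpy as np

def quantile_activation(z: np.ndarray) -> np.ndarray:
    """Replace each neuron's raw value with its relative quantile within
    the context distribution, approximated here by the batch (assumption).
    z has shape (batch, features)."""
    ranks = z.argsort(axis=0).argsort(axis=0)  # per-feature ranks 0..B-1
    q = (ranks + 0.5) / z.shape[0]             # empirical quantiles in (0, 1)
    return 2.0 * q - 1.0                       # center around zero (assumption)

z = np.random.default_rng(1).normal(size=(8, 3))
print(quantile_activation(z).round(2))         # rank-based values in (-1, 1)
</code></pre>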
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11181">arXiv:2405.11181</a> <span> [<a href="https://arxiv.org/pdf/2405.11181">pdf</a>, <a href="https://arxiv.org/format/2405.11181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Knowledge-Infused Automated Disease Diagnosis Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tomar%2C+M">Mohit Tomar</a>, <a href="/search/cs?searchtype=author&query=Tiwari%2C+A">Abhisek Tiwari</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11181v1-abstract-short" style="display: inline;"> With the advancement of internet communication and telemedicine, people are increasingly turning to the web for various healthcare activities. With an ever-increasing number of diseases and symptoms, diagnosing patients becomes challenging. In this work, we build a diagnosis assistant to assist doctors, which identifies diseases based on patient-doctor interaction. During diagnosis, doctors utiliz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11181v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11181v1-abstract-full" style="display: none;"> With the advancement of internet communication and telemedicine, people are increasingly turning to the web for various healthcare activities. With an ever-increasing number of diseases and symptoms, diagnosing patients becomes challenging. In this work, we build a diagnosis assistant to assist doctors, which identifies diseases based on patient-doctor interaction. During diagnosis, doctors utilize both symptomatology knowledge and diagnostic experience to identify diseases accurately and efficiently. Inspired by this, we investigate the role of medical knowledge in disease diagnosis through doctor-patient interaction. We propose a two-channel, knowledge-infused, discourse-aware disease diagnosis model (KI-DDI), where the first channel encodes patient-doctor communication using a transformer-based encoder, while the other creates an embedding of symptom-disease using a graph attention network (GAT). In the next stage, the conversation and knowledge graph embeddings are infused together and fed to a deep neural network for disease identification. Furthermore, we first develop an empathetic conversational medical corpus comprising conversations between patients and doctors, annotated with intent and symptoms information. The proposed model demonstrates a significant improvement over the existing state-of-the-art models, establishing the crucial roles of (a) a doctor's effort for additional symptom extraction (in addition to patient self-report) and (b) infusing medical knowledge in identifying diseases effectively. 
Many times, patients also show their medical conditions, which act as crucial evidence in diagnosis. Therefore, integrating visual sensory information would represent an effective avenue for enhancing the capabilities of diagnostic assistants. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11181v1-abstract-full').style.display = 'none'; document.getElementById('2405.11181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09589">arXiv:2405.09589</a> <span> [<a href="https://arxiv.org/pdf/2405.09589">pdf</a>, <a href="https://arxiv.org/format/2405.09589">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey of Hallucination in Large Language, Image, Video and Audio Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sahoo%2C+P">Pranab Sahoo</a>, <a href="/search/cs?searchtype=author&query=Meharia%2C+P">Prabhash Meharia</a>, <a href="/search/cs?searchtype=author&query=Ghosh%2C+A">Akash Ghosh</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+V">Vinija Jain</a>, <a href="/search/cs?searchtype=author&query=Chadha%2C+A">Aman Chadha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09589v4-abstract-short" style="display: inline;"> The rapid advancement of foundation models (FMs) across language, image, audio, and video domains has shown remarkable capabilities in diverse tasks. However, the proliferation of FMs brings forth a critical challenge: the potential to generate hallucinated outputs, particularly in high-stakes applications. The tendency of foundation models to produce hallucinated content arguably represents the b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09589v4-abstract-full').style.display = 'inline'; document.getElementById('2405.09589v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09589v4-abstract-full" style="display: none;"> The rapid advancement of foundation models (FMs) across language, image, audio, and video domains has shown remarkable capabilities in diverse tasks. 
However, the proliferation of FMs brings forth a critical challenge: the potential to generate hallucinated outputs, particularly in high-stakes applications. The tendency of foundation models to produce hallucinated content arguably represents the biggest hindrance to their widespread adoption in real-world scenarios, especially in domains where reliability and accuracy are paramount. This survey paper presents a comprehensive overview of recent developments that aim to identify and mitigate the problem of hallucination in FMs, spanning text, image, video, and audio modalities. By synthesizing recent advancements in detecting and mitigating hallucination across various modalities, the paper aims to provide valuable insights for researchers, developers, and practitioners. Essentially, it establishes a clear framework encompassing definition, taxonomy, and detection strategies for addressing hallucination in multimodal foundation models, laying the foundation for future research in this pivotal area. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09589v4-abstract-full').style.display = 'none'; document.getElementById('2405.09589v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.06124">arXiv:2405.06124</a> <span> [<a href="https://arxiv.org/pdf/2405.06124">pdf</a>, <a href="https://arxiv.org/format/2405.06124">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Demystifying Behavior-Based Malware Detection at Endpoints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kaya%2C+Y">Yigitcan Kaya</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yizheng Chen</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Shoumik Saha</a>, <a href="/search/cs?searchtype=author&query=Pierazzi%2C+F">Fabio Pierazzi</a>, <a href="/search/cs?searchtype=author&query=Cavallaro%2C+L">Lorenzo Cavallaro</a>, <a href="/search/cs?searchtype=author&query=Wagner%2C+D">David Wagner</a>, <a href="/search/cs?searchtype=author&query=Dumitras%2C+T">Tudor Dumitras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.06124v1-abstract-short" style="display: inline;"> Machine learning is widely used for malware detection in practice. Prior behavior-based detectors most commonly rely on traces of programs executed in controlled sandboxes. However, sandbox traces are unavailable to the last line of defense offered by security vendors: malware detection at endpoints. 
A detector at endpoints consumes the traces of programs running on real-world hosts, as sandbox an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06124v1-abstract-full').style.display = 'inline'; document.getElementById('2405.06124v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.06124v1-abstract-full" style="display: none;"> Machine learning is widely used for malware detection in practice. Prior behavior-based detectors most commonly rely on traces of programs executed in controlled sandboxes. However, sandbox traces are unavailable to the last line of defense offered by security vendors: malware detection at endpoints. A detector at endpoints consumes the traces of programs running on real-world hosts, as sandbox analysis might introduce intolerable delays. Despite their success in the sandboxes, research hints at potential challenges for ML methods at endpoints, e.g., highly variable malware behaviors. Nonetheless, the impact of these challenges on existing approaches and how their excellent sandbox performance translates to the endpoint scenario remain unquantified. We present the first measurement study of the performance of ML-based malware detectors at real-world endpoints. Leveraging a dataset of sandbox traces and a dataset of in-the-wild program traces, we evaluate two scenarios where the endpoint detector was trained on (i) sandbox traces (convenient and accessible); and (ii) endpoint traces (less accessible due to needing to collect telemetry data). This allows us to identify a wide gap between prior methods' sandbox-based detection performance--over 90%--and endpoint performances--below 20% and 50% in (i) and (ii), respectively. We pinpoint and characterize the challenges contributing to this gap, such as label noise, behavior variability, or sandbox evasion. To close this gap, we propose approaches that yield a relative improvement of 5-30% over the baselines. Our evidence suggests that applying detectors trained on sandbox data to endpoint detection -- scenario (i) -- is challenging. The most promising direction is training detectors on endpoint data -- scenario (ii) -- which marks a departure from widespread practice. We implement a leaderboard for realistic detector evaluations to promote research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06124v1-abstract-full').style.display = 'none'; document.getElementById('2405.06124v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Behavior-based malware detection with machine learning. 18 pages, 10 figures, 15 tables. 
Leaderboard: https://malwaredetectioninthewild.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.05813">arXiv:2405.05813</a> <span> [<a href="https://arxiv.org/pdf/2405.05813">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Revitalising Stagecraft: NLP-Driven Sentiment Analysis for Traditional Theater Revival </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Samanta%2C+S">Saikat Samanta</a>, <a href="/search/cs?searchtype=author&query=Karmakar%2C+S">Saptarshi Karmakar</a>, <a href="/search/cs?searchtype=author&query=Behuria%2C+S">Satayajay Behuria</a>, <a href="/search/cs?searchtype=author&query=Dutta%2C+S">Shibam Dutta</a>, <a href="/search/cs?searchtype=author&query=Das%2C+S">Soujit Das</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Soumik Saha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.05813v1-abstract-short" style="display: inline;"> This paper explores the application of FilmFrenzy, a Python-based ticket-booking web application, in the revival of traditional Indian theatres. Additionally, this research paper explores how NLP can be implemented to improve user experience. Through clarifying audience views and pinpointing opportunities for development, FilmFrenzy aims to promote involvement and rejuvenation in India's conventio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05813v1-abstract-full').style.display = 'inline'; document.getElementById('2405.05813v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.05813v1-abstract-full" style="display: none;"> This paper explores the application of FilmFrenzy, a Python-based ticket-booking web application, in the revival of traditional Indian theatres. Additionally, this research paper explores how NLP can be implemented to improve user experience. Through clarifying audience views and pinpointing opportunities for development, FilmFrenzy aims to promote involvement and rejuvenation in India's conventional theatre scene. The platform seeks to maintain the relevance and vitality of conventional theatres by bridging the gap between audiences and theatres through the incorporation of contemporary technologies, especially NLP. This research envisions a future in which technology plays a crucial part in maintaining India's rich theatrical traditions, thereby contributing to the preservation and development of cultural heritage. With sentiment analysis and natural language processing (NLP) as essential instruments for improving stagecraft, the research envisions a period when traditional theatre will still be vibrant. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.05813v1-abstract-full').style.display = 'none'; document.getElementById('2405.05813v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
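<p class="is-size-7">The abstract does not specify FilmFrenzy's NLP stack; as one concrete possibility (an assumption, not the paper's code), audience reviews could be scored with NLTK's off-the-shelf VADER sentiment analyzer, whose compound score in [-1, 1] directly supports the kind of audience-view analysis described above.</p> <pre><code class="language-python">
# Illustration only (not FilmFrenzy's actual stack): score audience reviews
# with NLTK's VADER sentiment analyzer.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)  # one-time lexicon download
sia = SentimentIntensityAnalyzer()

reviews = [
    "The staging was breathtaking and the cast was superb.",
    "Uncomfortable seats and the sound kept cutting out.",
]
for review in reviews:
    score = sia.polarity_scores(review)["compound"]  # compound in [-1, 1]
    print(f"{score:+.3f}  {review}")
</code></pre>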
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04610">arXiv:2405.04610</a> <span> [<a href="https://arxiv.org/pdf/2405.04610">pdf</a>, <a href="https://arxiv.org/format/2405.04610">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploring Explainable AI Techniques for Improved Interpretability in Lung and Colon Cancer Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Moin%2C+M+B">Mukaffi Bin Moin</a>, <a href="/search/cs?searchtype=author&query=Faria%2C+F+T+J">Fatema Tuj Johora Faria</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Swarnajit Saha</a>, <a href="/search/cs?searchtype=author&query=Rafa%2C+B+K">Busra Kamal Rafa</a>, <a href="/search/cs?searchtype=author&query=Alam%2C+M+S">Mohammad Shafiul Alam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04610v2-abstract-short" style="display: inline;"> Lung and colon cancer are serious worldwide health challenges that require early and precise identification to reduce mortality risks. However, diagnosis, which is mostly dependent on histopathologists' competence, presents difficulties and hazards when expertise is insufficient. While diagnostic methods like imaging and blood markers contribute to early detection, histopathology remains the gold… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04610v2-abstract-full').style.display = 'inline'; document.getElementById('2405.04610v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04610v2-abstract-full" style="display: none;"> Lung and colon cancer are serious worldwide health challenges that require early and precise identification to reduce mortality risks. However, diagnosis, which is mostly dependent on histopathologists' competence, presents difficulties and hazards when expertise is insufficient. While diagnostic methods like imaging and blood markers contribute to early detection, histopathology remains the gold standard, although time-consuming and vulnerable to inter-observer mistakes. Limited access to high-end technology further limits patients' ability to receive immediate medical care and diagnosis. Recent advances in deep learning have generated interest in its application to medical imaging analysis, specifically the use of histopathological images to diagnose lung and colon cancer. The goal of this investigation is to use and adapt existing pre-trained CNN-based models, such as Xception, DenseNet201, ResNet101, InceptionV3, DenseNet121, DenseNet169, ResNet152, and InceptionResNetV2, to enhance classification through better augmentation strategies. The results show tremendous progress, with all eight models reaching impressive accuracy ranging from 97% to 99%. 
Furthermore, attention visualization techniques such as GradCAM, GradCAM++, ScoreCAM, Faster Score-CAM, and LayerCAM, as well as Vanilla Saliency and SmoothGrad, are used to provide insights into the models' classification decisions, thereby improving interpretability and understanding of malignant and benign image classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04610v2-abstract-full').style.display = 'none'; document.getElementById('2405.04610v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in 4th International Conference on Computing and Communication Networks (ICCCNet-2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19306">arXiv:2404.19306</a> <span> [<a href="https://arxiv.org/pdf/2404.19306">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Comprehensive Forecasting-Based Analysis of Hybrid and Stacked Stateful/Stateless Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+S">Swayamjit Saha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19306v1-abstract-short" style="display: inline;"> Wind speed is a powerful source of renewable energy, which can be used as an alternative to the non-renewable resources for production of electricity. Renewable sources are clean, infinite and do not impact the environment negatively during production of electrical energy. However, eliciting electrical energy from renewable resources viz. solar irradiance, wind speed, and hydro requires sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19306v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19306v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19306v1-abstract-full" style="display: none;"> Wind speed is a powerful source of renewable energy, which can be used as an alternative to the non-renewable resources for production of electricity. Renewable sources are clean, infinite and do not impact the environment negatively during production of electrical energy. However, eliciting electrical energy from renewable resources viz. solar irradiance, wind speed, and hydro requires special planning, failing which may result in huge loss of labour and money for setting up the system. In this paper, we discuss four deep recurrent neural networks viz. 
arXiv:2404.19306 [pdf] https://arxiv.org/abs/2404.19306
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Comprehensive Forecasting-Based Analysis of Hybrid and Stacked Stateful/Stateless Models
Authors: Swayamjit Saha
Abstract: Wind speed is a powerful source of renewable energy, which can be used as an alternative to non-renewable resources for the production of electricity. Renewable sources are clean and infinite and do not impact the environment negatively during the production of electrical energy. However, eliciting electrical energy from renewable resources, viz. solar irradiance, wind speed, and hydro, requires special planning, failing which the result may be a huge loss of labour and money in setting up the system. In this paper, we discuss four deep recurrent neural networks, viz. Stacked Stateless LSTM, Stacked Stateless GRU, Stacked Stateful LSTM, and Stacked Stateful GRU, which are used to predict wind speed on a short-term basis for the airport sites beside two campuses of Mississippi State University. The paper gives a comprehensive analysis of the performance of the models used, describing their architectures and how efficiently they elicit the results with the help of RMSE values. A detailed description of the time and space complexities of the above models is also discussed.
Submitted 30 April, 2024; originally announced April 2024.
Comments: 8 pages, 14 figures
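Illustrative sketch (layer sizes, window length, and data are assumptions, not the paper's configuration; TensorFlow 2.x tf.keras assumed): a stacked stateful recurrent model fixes the batch size, carries cell state across consecutive batches, and is trained without shuffling so windows of one series stay in order.

    # Stacked stateful LSTM: two LSTM layers, fixed batch size, state carried
    # across batches so consecutive windows of one series stay connected.
    import numpy as np
    from tensorflow import keras

    batch, steps, features = 32, 24, 1                 # e.g. 24 hourly wind readings
    model = keras.Sequential([
        keras.Input(batch_shape=(batch, steps, features)),  # fixed batch: required for stateful
        keras.layers.LSTM(64, stateful=True, return_sequences=True),
        keras.layers.LSTM(32, stateful=True),          # stacked second recurrent layer
        keras.layers.Dense(1),                         # next-step wind speed
    ])
    model.compile(optimizer="adam", loss="mse")

    X = np.random.rand(batch * 10, steps, features).astype("float32")  # toy series
    y = np.random.rand(batch * 10, 1).astype("float32")
    model.fit(X, y, batch_size=batch, epochs=2, shuffle=False)  # preserve temporal order
    model.reset_states()                               # clear state between independent series

A stateless variant simply drops stateful=True and the fixed batch shape, resetting the hidden state after every batch.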
arXiv:2404.18546 [pdf, other] https://arxiv.org/abs/2404.18546
Categories: cs.IR (Information Retrieval)
Title: ir_explain: a Python Library of Explainable IR Methods
Authors: Sourav Saha, Harsh Agarwal, Swastik Mohanty, Mandar Mitra, Debapriyo Majumdar
Abstract: While recent advancements in Neural Ranking Models have resulted in significant improvements over traditional statistical retrieval models, it is generally acknowledged that the use of large neural architectures and the application of complex language models in Information Retrieval (IR) have reduced the transparency of retrieval methods. Consequently, Explainability and Interpretability have emerged as important research topics in IR. Several axiomatic and post-hoc explanation methods, as well as approaches that attempt to be interpretable-by-design, have been proposed. This article presents ir_explain, an open-source Python library that implements a variety of well-known techniques for Explainable IR (ExIR) within a common, extensible framework. ir_explain supports the three standard categories of post-hoc explanations, namely pointwise, pairwise, and listwise explanations. The library is designed to make it easy to reproduce state-of-the-art ExIR baselines on standard test collections, as well as to explore new approaches to explaining IR models and methods. To facilitate adoption, ir_explain is well-integrated with widely-used toolkits such as Pyserini and ir_datasets.
Submitted 29 April, 2024; originally announced April 2024.
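ir_explain's own API is not reproduced here. As a generic illustration of the pointwise category it supports, the sketch below attributes a query-document relevance score to individual document terms by leave-one-out masking, with a toy scorer standing in for a neural ranker:

    # Pointwise explanation by term masking: score the document, drop one term
    # at a time, and rank terms by how much the score falls without them.
    from collections import Counter
    import math

    def score(query, doc):                     # toy term-frequency scorer (assumption)
        tf = Counter(doc.split())
        return sum(math.log(1 + tf[t]) for t in query.split())

    def pointwise_explanation(query, doc):
        base = score(query, doc)
        terms = doc.split()
        deltas = {}
        for i, t in enumerate(terms):          # leave-one-out masking
            masked = " ".join(terms[:i] + terms[i + 1:])
            deltas[t] = base - score(query, masked)
        return sorted(deltas.items(), key=lambda kv: -kv[1])

    print(pointwise_explanation("deep ranking models",
                                "neural ranking models learn deep representations"))

Pairwise and listwise explanations generalize the same idea to score differences between two documents and to whole ranked lists, respectively.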
arXiv:2404.10296 [pdf, other] https://arxiv.org/abs/2404.10296
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.NE (Neural and Evolutionary Computing)
Title: Interpolating neural network: A lightweight yet precise architecture for data training, equation solving, and parameter calibration
Authors: Chanwook Park, Sourav Saha, Jiachen Guo, Hantao Zhang, Xiaoyu Xie, Miguel A. Bessa, Dong Qian, Wei Chen, Gregory J. Wagner, Jian Cao, Wing Kam Liu
Abstract: Artificial intelligence (AI) has revolutionized software development, shifting from task-specific codes (Software 1.0) to neural network-based approaches (Software 2.0). However, applying this transition in engineering software presents challenges, including low surrogate model accuracy, the curse of dimensionality in inverse design, and rising complexity in physical simulations. We introduce an interpolating neural network (INN), grounded in interpolation theory and tensor decomposition, to realize Engineering Software 2.0 by advancing data training, partial differential equation solving, and parameter calibration. INN offers orders of magnitude fewer trainable/solvable parameters for comparable model accuracy than traditional multi-layer perceptrons (MLP) or physics-informed neural networks (PINN). Demonstrated in metal additive manufacturing, INN rapidly constructs an accurate surrogate model of Laser Powder Bed Fusion (L-PBF) heat transfer simulation, achieving sub-10-micrometer resolution for a 10 mm path in under 15 minutes on a single GPU. This marks a transformative step forward across all domains essential to engineering software.
Submitted 14 November 2024; v1 submitted 16 April, 2024; originally announced April 2024.
Comments: 9 pages, 2 figures
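Conceptual sketch only (the actual INN combines interpolation theory with tensor decomposition in higher dimensions): a 1D "interpolating layer" learns nodal values on a fixed grid and predicts by piecewise-linear, FEM-style interpolation, so the trainable parameters are the nodal values rather than MLP weights.

    # A 1D interpolating layer: parameters are values at grid nodes; prediction
    # is hat-function (piecewise-linear) interpolation between the two
    # surrounding nodes, so parameter count = number of nodes.
    import torch

    class Interp1D(torch.nn.Module):
        def __init__(self, n_nodes=17, lo=0.0, hi=1.0):
            super().__init__()
            self.register_buffer("grid", torch.linspace(lo, hi, n_nodes))
            self.values = torch.nn.Parameter(torch.zeros(n_nodes))  # nodal DOFs

        def forward(self, x):
            # locate the cell each x falls in, then linear interpolation
            idx = torch.clamp(torch.searchsorted(self.grid, x) - 1,
                              0, len(self.grid) - 2)
            x0, x1 = self.grid[idx], self.grid[idx + 1]
            w = (x - x0) / (x1 - x0)
            return (1 - w) * self.values[idx] + w * self.values[idx + 1]

    model = Interp1D()
    opt = torch.optim.Adam(model.parameters(), lr=0.05)
    x = torch.rand(256); y = torch.sin(6.28 * x)       # toy function to fit
    for _ in range(300):
        opt.zero_grad()
        loss = ((model(x) - y) ** 2).mean()
        loss.backward(); opt.step()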
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07410">arXiv:2404.07410</a> <span> [<a href="https://arxiv.org/pdf/2404.07410">pdf</a>, <a href="https://arxiv.org/format/2404.07410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Shift Invariance in Convolutional Neural Networks with Translation Invariant Polyphase Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sourajit Saha</a>, <a href="/search/cs?searchtype=author&query=Gokhale%2C+T">Tejas Gokhale</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07410v1-abstract-short" style="display: inline;"> Downsampling operators break the shift invariance of convolutional neural networks (CNNs) and this affects the robustness of features learned by CNNs when dealing with even small pixel-level shift. Through a large-scale correlation analysis framework, we study shift invariance of CNNs by inspecting existing downsampling operators in terms of their maximum-sampling bias (MSB), and find that MSB is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07410v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07410v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07410v1-abstract-full" style="display: none;"> Downsampling operators break the shift invariance of convolutional neural networks (CNNs) and this affects the robustness of features learned by CNNs when dealing with even small pixel-level shift. Through a large-scale correlation analysis framework, we study shift invariance of CNNs by inspecting existing downsampling operators in terms of their maximum-sampling bias (MSB), and find that MSB is negatively correlated with shift invariance. Based on this crucial insight, we propose a learnable pooling operator called Translation Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate feature maps of TIPS to reduce MSB and learn translation-invariant representations. TIPS can be integrated into any CNN and can be trained end-to-end with marginal computational overhead. Our experiments demonstrate that TIPS results in consistent performance gains in terms of accuracy, shift consistency, and shift fidelity on multiple benchmarks for image classification and semantic segmentation compared to previous methods and also leads to improvements in adversarial and distributional robustness. TIPS results in the lowest MSB compared to all previous methods, thus explaining our strong empirical results. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07410v1-abstract-full').style.display = 'none'; document.getElementById('2404.07410v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07214">arXiv:2404.07214</a> <span> [<a href="https://arxiv.org/pdf/2404.07214">pdf</a>, <a href="https://arxiv.org/format/2404.07214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Frontier of Vision-Language Models: A Survey of Current Methodologies and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ghosh%2C+A">Akash Ghosh</a>, <a href="/search/cs?searchtype=author&query=Acharya%2C+A">Arkadeep Acharya</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Sriparna Saha</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+V">Vinija Jain</a>, <a href="/search/cs?searchtype=author&query=Chadha%2C+A">Aman Chadha</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07214v2-abstract-short" style="display: inline;"> The advent of Large Language Models (LLMs) has significantly reshaped the trajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable limitation, as they are primarily adept at processing textual information. To address this constraint, researchers have endeavored to integrate visual capabilities with LLMs, resulting in the emergence of Vision-Language Models (VLMs). These advanced… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07214v2-abstract-full').style.display = 'inline'; document.getElementById('2404.07214v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07214v2-abstract-full" style="display: none;"> The advent of Large Language Models (LLMs) has significantly reshaped the trajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable limitation, as they are primarily adept at processing textual information. To address this constraint, researchers have endeavored to integrate visual capabilities with LLMs, resulting in the emergence of Vision-Language Models (VLMs). These advanced models are instrumental in tackling more intricate tasks such as image captioning and visual question answering. In our comprehensive survey paper, we delve into the key advancements within the realm of VLMs. 
arXiv:2404.07214 [pdf, other] https://arxiv.org/abs/2404.07214
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Exploring the Frontier of Vision-Language Models: A Survey of Current Methodologies and Future Directions
Authors: Akash Ghosh, Arkadeep Acharya, Sriparna Saha, Vinija Jain, Aman Chadha
Abstract: The advent of Large Language Models (LLMs) has significantly reshaped the trajectory of the AI revolution. Nevertheless, these LLMs exhibit a notable limitation, as they are primarily adept at processing textual information. To address this constraint, researchers have endeavored to integrate visual capabilities with LLMs, resulting in the emergence of Vision-Language Models (VLMs). These advanced models are instrumental in tackling more intricate tasks such as image captioning and visual question answering. In our comprehensive survey paper, we delve into the key advancements within the realm of VLMs. Our classification organizes VLMs into three distinct categories: models dedicated to vision-language understanding, models that process multimodal inputs to generate unimodal (textual) outputs, and models that both accept and produce multimodal inputs and outputs. This classification is based on their respective capabilities and functionalities in processing and generating various modalities of data. We meticulously dissect each model, offering an extensive analysis of its foundational architecture, training data sources, as well as its strengths and limitations wherever possible, providing readers with a comprehensive understanding of its essential components. We also analyze the performance of VLMs on various benchmark datasets. By doing so, we aim to offer a nuanced understanding of the diverse landscape of VLMs. Additionally, we underscore potential avenues for future research in this dynamic domain, anticipating further breakthroughs and advancements.
Submitted 12 April, 2024; v1 submitted 20 February, 2024; originally announced April 2024.
Comments: The most extensive and up-to-date survey on Visual Language Models, covering 76 Visual Language Models
arXiv:2404.03799 [pdf, other] https://arxiv.org/abs/2404.03799
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Language-Guided Instance-Aware Domain-Adaptive Panoptic Segmentation
Authors: Elham Amin Mansour, Ozan Unal, Suman Saha, Benjamin Bejar, Luc Van Gool
Abstract: The increasing relevance of panoptic segmentation is tied to the advancements in autonomous driving and AR/VR applications. However, the deployment of such models has been limited due to the expensive nature of dense data annotation, giving rise to unsupervised domain adaptation (UDA). A key challenge in panoptic UDA is reducing the domain gap between a labeled source and an unlabeled target domain while harmonizing the subtasks of semantic and instance segmentation to limit catastrophic interference. While considerable progress has been achieved, existing approaches mainly focus on the adaptation of semantic segmentation. In this work, we focus on incorporating instance-level adaptation via a novel instance-aware cross-domain mixing strategy IMix. IMix significantly enhances the panoptic quality by improving instance segmentation performance. Specifically, we propose inserting high-confidence predicted instances from the target domain onto source images, retaining the exhaustiveness of the resulting pseudo-labels while reducing the injected confirmation bias. Nevertheless, such an enhancement comes at the cost of degraded semantic performance, attributed to catastrophic forgetting. To mitigate this issue, we regularize our semantic branch by employing CLIP-based domain alignment (CDA), exploiting the domain-robustness of natural language prompts. Finally, we present an end-to-end model incorporating these two mechanisms called LIDAPS, achieving state-of-the-art results on all popular panoptic UDA benchmarks.
Submitted 4 April, 2024; originally announced April 2024.
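Simplified sketch of the instance-mixing idea (array shapes and the confidence threshold are illustrative assumptions): pixels of high-confidence instances predicted on a target image are pasted onto a source image, and the source labels are overwritten with the pseudo-labels there.

    # IMix-style cross-domain mixing: copy confident pseudo-instances from the
    # target image onto the source image and its label map.
    import numpy as np

    def imix(src_img, src_lbl, tgt_img, inst_masks, inst_classes, inst_scores,
             thresh=0.9):
        img, lbl = src_img.copy(), src_lbl.copy()
        for mask, cls, conf in zip(inst_masks, inst_classes, inst_scores):
            if conf < thresh:                  # keep only confident pseudo-instances
                continue
            img[mask] = tgt_img[mask]          # paste target pixels onto source
            lbl[mask] = cls                    # overwrite labels with pseudo-label
        return img, lbl

    H, W = 64, 64
    src_img = np.zeros((H, W, 3), np.float32); src_lbl = np.zeros((H, W), np.int64)
    tgt_img = np.ones((H, W, 3), np.float32)
    mask = np.zeros((H, W), bool); mask[10:30, 10:30] = True  # one predicted instance
    mixed_img, mixed_lbl = imix(src_img, src_lbl, tgt_img, [mask], [7], [0.95])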
arXiv:2404.00471 [pdf, other] https://arxiv.org/abs/2404.00471
Categories: physics.med-ph (Medical Physics); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); eess.IV (Image and Video Processing)
DOI: 10.1109/ICASSP48485.2024.10447579 (https://doi.org/10.1109/ICASSP48485.2024.10447579)
Title: Score-Based Diffusion Models for Photoacoustic Tomography Image Reconstruction
Authors: Sreemanti Dey, Snigdha Saha, Berthy T. Feng, Manxiu Cui, Laure Delisle, Oscar Leong, Lihong V. Wang, Katherine L. Bouman
Abstract: Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality that combines optical absorption contrast with ultrasound imaging depth. One challenge in PAT is image reconstruction with inadequate acoustic signals due to limited sensor coverage or due to the density of the transducer array. Such cases call for solving an ill-posed inverse reconstruction problem. In this work, we use score-based diffusion models to solve the inverse problem of reconstructing an image from limited PAT measurements. The proposed approach allows us to incorporate an expressive prior learned by a diffusion model on simulated vessel structures while still being robust to varying transducer sparsity conditions.
Submitted 30 March, 2024; originally announced April 2024.
Comments: 5 pages
Journal ref: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Seoul, Korea, Republic of, 2024, pp. 2470-2474
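Schematic sketch (step rule, forward operator, and score network are toy stand-ins, not the paper's method): score-guided reconstruction of this kind alternates the learned prior score with a data-consistency gradient toward the measurements y = A(x) during reverse diffusion.

    # Score-guided inverse-problem sampling, schematically: each reverse step
    # combines the prior score with a gradient that pulls A(x) toward y.
    import torch

    def posterior_sample(score, A, y, steps=500, lam=1.0):
        x = torch.randn(1, 1, 128, 128)                # start from pure noise
        dt = 1.0 / steps
        for t in torch.linspace(1.0, 1e-3, steps):
            x = x.detach().requires_grad_(True)
            misfit = (A(x) - y).pow(2).sum()           # data term ||A(x) - y||^2
            g = torch.autograd.grad(misfit, x)[0]
            with torch.no_grad():
                x = x + (score(x, t) - lam * g) * dt + dt**0.5 * torch.randn_like(x)
        return x.detach()

    A = lambda x: x[:, :, ::4, ::4]                    # toy sparse-sensor operator
    y = A(torch.zeros(1, 1, 128, 128))                 # toy measurements
    score = lambda x, t: -x                            # toy score (Gaussian prior)
    recon = posterior_sample(score, A, y)

In the paper's setting the score network is trained on simulated vessel structures and A models the PAT acquisition physics.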
arXiv:2403.14290 [pdf, other] https://arxiv.org/abs/2403.14290
Categories: cs.SD (Sound); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); eess.AS (Audio and Speech Processing)
Title: Exploring Green AI for Audio Deepfake Detection
Authors: Subhajit Saha, Md Sahidullah, Swagatam Das
Abstract: The state-of-the-art audio deepfake detectors leveraging deep neural networks exhibit impressive recognition performance. Nonetheless, this advantage is accompanied by a significant carbon footprint, mainly due to the use of high-performance computing with accelerators and long training times. Studies show that an average deep NLP model produces around 626k lbs of CO2, equivalent to five times the lifetime emissions of an average US car; this is certainly a massive threat to the environment. To tackle this challenge, this study presents a novel framework for audio deepfake detection that can be seamlessly trained using standard CPU resources. Our proposed framework utilizes off-the-shelf self-supervised learning (SSL) based models which are pre-trained and available in public repositories. In contrast to existing methods that fine-tune SSL models and employ additional deep neural networks for downstream tasks, we exploit classical machine learning algorithms such as logistic regression and shallow neural networks on the SSL embeddings extracted with the pre-trained model. Our approach shows competitive results compared to the commonly used high-carbon-footprint approaches. In experiments with the ASVspoof 2019 LA dataset, we achieve a 0.90% equal error rate (EER) with less than 1k trainable model parameters. To encourage further research in this direction and support reproducible results, the Python code will be made publicly accessible following acceptance. Github: https://github.com/sahasubhajit/Speech-Spoofing-
Submitted 21 March, 2024; originally announced March 2024.
Comments: This manuscript is under review at a conference.
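Minimal sketch of that recipe, assuming wav2vec 2.0 as the public SSL checkpoint (an assumption; the abstract does not name the model here) and toy audio and labels: freeze the SSL model, mean-pool its embeddings, and fit a plain logistic regression on CPU.

    # Frozen SSL embeddings + classical classifier: no fine-tuning, no GPU.
    import numpy as np
    import torch
    from sklearn.linear_model import LogisticRegression
    from transformers import Wav2Vec2Model

    ssl = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").eval()

    def embed(wave):                                   # 1-D float waveform @ 16 kHz
        with torch.no_grad():
            h = ssl(torch.tensor(wave)[None]).last_hidden_state
        return h.mean(dim=1).squeeze(0).numpy()        # mean-pool over time frames

    waves = [np.random.randn(16000).astype("float32") for _ in range(8)]  # toy audio
    labels = [0, 1] * 4                                # bona fide vs. spoofed
    X = np.stack([embed(w) for w in waves])
    clf = LogisticRegression(max_iter=1000).fit(X, labels)
    print(clf.score(X, labels))

The trainable parameter count is just the logistic-regression weights (embedding dimension + 1), consistent with the sub-1k figure the abstract reports for its smallest models.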
arXiv:2403.00643 [pdf, ps, other] https://arxiv.org/abs/2403.00643
Categories: cs.DS (Data Structures and Algorithms); cs.CC (Computational Complexity); math.NA (Numerical Analysis)
Title: Undercomplete Decomposition of Symmetric Tensors in Linear Time, and Smoothed Analysis of the Condition Number
Authors: Pascal Koiran, Subhayan Saha
Abstract: We study symmetric tensor decompositions, i.e., decompositions of the form $T = \sum_{i=1}^r u_i^{\otimes 3}$ where $T$ is a symmetric tensor of order 3 and $u_i \in \mathbb{C}^n$. In order to obtain efficient decomposition algorithms, it is necessary to require additional properties from the $u_i$. In this paper we assume that the $u_i$ are linearly independent. This implies $r \leq n$, that is, the decomposition of $T$ is undercomplete. We give a randomized algorithm for the following problem in the exact arithmetic model of computation: Let $T$ be an order-3 symmetric tensor that has an undercomplete decomposition. Then given some $T'$ close to $T$, an accuracy parameter $\varepsilon$, and an upper bound $B$ on the condition number of the tensor, output vectors $u'_i$ such that $\|u_i - u'_i\| \leq \varepsilon$ (up to permutation and multiplication by cube roots of unity) with high probability. The main novel features of our algorithm are: 1) We provide the first algorithm for this problem that runs in linear time in the size of the input tensor. More specifically, it requires $O(n^3)$ arithmetic operations for all accuracy parameters $\varepsilon = 1/\mathrm{poly}(n)$ and $B = \mathrm{poly}(n)$. 2) Our algorithm is robust, that is, it can handle inverse-quasi-polynomial noise (in $n$, $B$, $\frac{1}{\varepsilon}$) in the input tensor. 3) We present a smoothed analysis of the condition number of the tensor decomposition problem. This guarantees that the condition number is low with high probability and further shows that our algorithm runs in linear time, except for some rare badly conditioned inputs. Our main algorithm is a reduction to the complete case ($r = n$) treated in our previous work [Koiran, Saha, CIAC 2023]. For efficiency reasons we cannot use this algorithm as a blackbox. Instead, we show that it can be run on an implicitly represented tensor obtained from the input tensor by a change of basis.
Submitted 1 March, 2024; originally announced March 2024.
Comments: 55 pages
MSC Class: 68W20; 68W40; 65F35; 15A69. ACM Class: F.2.1; G.1.3