Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,442 results for author: <span class="mathjax">Park, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Park%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Park, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Park%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Park, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Park%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14042">arXiv:2411.14042</a> <span> [<a href="https://arxiv.org/pdf/2411.14042">pdf</a>, <a href="https://arxiv.org/format/2411.14042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Forecasting Future International Events: A Reliable Dataset for Text-Based Event Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gwak%2C+D">Daehoon Gwak</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junwoo Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+M">Minho Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+C">Chaehun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hyunchan Lee</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+E">Edward Choi</a>, <a href="/search/cs?searchtype=author&query=Choo%2C+J">Jaegul Choo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14042v1-abstract-short" style="display: inline;"> Predicting future international events from textual information, such as news articles, has tremendous potential for applications in global policy, strategic decision-making, and geopolitics. However, existing datasets available for this task are often limited in quality, hindering the progress of related research. In this paper, we introduce WORLDREP (WORLD Relationship and Event Prediction), a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14042v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14042v1-abstract-full" style="display: none;"> Predicting future international events from textual information, such as news articles, has tremendous potential for applications in global policy, strategic decision-making, and geopolitics. However, existing datasets available for this task are often limited in quality, hindering the progress of related research. In this paper, we introduce WORLDREP (WORLD Relationship and Event Prediction), a novel dataset designed to address these limitations by leveraging the advanced reasoning capabilities of large-language models (LLMs). Our dataset features high-quality scoring labels generated through advanced prompt modeling and rigorously validated by domain experts in political science. We showcase the quality and utility of WORLDREP for real-world event prediction tasks, demonstrating its effectiveness through extensive experiments and analysis. Furthermore, we publicly release our dataset along with the full automation source code for data collection, labeling, and benchmarking, aiming to support and advance research in text-based event prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14042v1-abstract-full').style.display = 'none'; document.getElementById('2411.14042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12199">arXiv:2411.12199</a> <span> [<a href="https://arxiv.org/pdf/2411.12199">pdf</a>, <a href="https://arxiv.org/format/2411.12199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RoSIS: Robust Framework for Text-Promptable Surgical Instrument Segmentation Using Vision-Language Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Choi%2C+T">Tae-Min Choi</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Juyoun Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12199v1-abstract-short" style="display: inline;"> Surgical instrument segmentation (SIS) is an essential task in computer-assisted surgeries, with deep learning-based research improving accuracy in complex environments. Recently, text-promptable segmentation methods have been introduced to generate masks based on text prompts describing target objects. However, these methods assume that the object described by a given text prompt exists in the sc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12199v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12199v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12199v1-abstract-full" style="display: none;"> Surgical instrument segmentation (SIS) is an essential task in computer-assisted surgeries, with deep learning-based research improving accuracy in complex environments. Recently, text-promptable segmentation methods have been introduced to generate masks based on text prompts describing target objects. However, these methods assume that the object described by a given text prompt exists in the scene. This results in mask generation whenever a related text prompt is provided, even if the object is absent from the image. Existing methods handle this by using prompts only for objects known to be present in the image, which introduces inaccessible information in a vision-based method setting and results in unfair comparisons. For fair comparison, we redefine existing text-promptable SIS settings to robust conditions, called Robust text-promptable SIS (R-SIS), designed to forward prompts of all classes and determine the existence of an object from a given text prompt for the fair comparison. Furthermore, we propose a novel framework, Robust Surgical Instrument Segmentation (RoSIS), which combines visual and language features for promptable segmentation in the R-SIS setting. RoSIS employs an encoder-decoder architecture with a Multi-Modal Fusion Block (MMFB) and a Selective Gate Block (SGB) to achieve balanced integration of vision and language features. Additionally, we introduce an iterative inference strategy that refines segmentation masks in two steps: an initial pass using name-based prompts, followed by a refinement step using location prompts. Experiments on various datasets and settings demonstrate that RoSIS outperforms existing vision-based and promptable methods under robust conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12199v1-abstract-full').style.display = 'none'; document.getElementById('2411.12199v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, submitted to IEEE transactions on Medical Imaging</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11283">arXiv:2411.11283</a> <span> [<a href="https://arxiv.org/pdf/2411.11283">pdf</a>, <a href="https://arxiv.org/format/2411.11283">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Hyperbolic Space-based Heterogeneous Graph Attention Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jongmin Park</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Seunghoon Han</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jong-Ryul Lee</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Sungsu Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11283v1-abstract-short" style="display: inline;"> To leverage the complex structures within heterogeneous graphs, recent studies on heterogeneous graph embedding use a hyperbolic space, characterized by a constant negative curvature and exponentially increasing space, which aligns with the structural properties of heterogeneous graphs. However, despite heterogeneous graphs inherently possessing diverse power-law structures, most hyperbolic hetero… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11283v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11283v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11283v1-abstract-full" style="display: none;"> To leverage the complex structures within heterogeneous graphs, recent studies on heterogeneous graph embedding use a hyperbolic space, characterized by a constant negative curvature and exponentially increasing space, which aligns with the structural properties of heterogeneous graphs. However, despite heterogeneous graphs inherently possessing diverse power-law structures, most hyperbolic heterogeneous graph embedding models use a single hyperbolic space for the entire heterogeneous graph, which may not effectively capture the diverse power-law structures within the heterogeneous graph. To address this limitation, we propose Multi-hyperbolic Space-based heterogeneous Graph Attention Network (MSGAT), which uses multiple hyperbolic spaces to effectively capture diverse power-law structures within heterogeneous graphs. We conduct comprehensive experiments to evaluate the effectiveness of MSGAT. The experimental results demonstrate that MSGAT outperforms state-of-the-art baselines in various graph machine learning tasks, effectively capturing the complex structures of heterogeneous graphs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11283v1-abstract-full').style.display = 'none'; document.getElementById('2411.11283v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in IEEE ICDM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10957">arXiv:2411.10957</a> <span> [<a href="https://arxiv.org/pdf/2411.10957">pdf</a>, <a href="https://arxiv.org/format/2411.10957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> IMPaCT GNN: Imposing invariance with Message Passing in Chronological split Temporal Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+S">Sejun Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J+Y">Joo Young Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H">Hyunwoo Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10957v1-abstract-short" style="display: inline;"> This paper addresses domain adaptation challenges in graph data resulting from chronological splits. In a transductive graph learning setting, where each node is associated with a timestamp, we focus on the task of Semi-Supervised Node Classification (SSNC), aiming to classify recent nodes using labels of past nodes. Temporal dependencies in node connections create domain shifts, causing significa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10957v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10957v1-abstract-full" style="display: none;"> This paper addresses domain adaptation challenges in graph data resulting from chronological splits. In a transductive graph learning setting, where each node is associated with a timestamp, we focus on the task of Semi-Supervised Node Classification (SSNC), aiming to classify recent nodes using labels of past nodes. Temporal dependencies in node connections create domain shifts, causing significant performance degradation when applying models trained on historical data into recent data. Given the practical relevance of this scenario, addressing domain adaptation in chronological split data is crucial, yet underexplored. We propose Imposing invariance with Message Passing in Chronological split Temporal Graphs (IMPaCT), a method that imposes invariant properties based on realistic assumptions derived from temporal graph structures. Unlike traditional domain adaptation approaches which rely on unverifiable assumptions, IMPaCT explicitly accounts for the characteristics of chronological splits. The IMPaCT is further supported by rigorous mathematical analysis, including a derivation of an upper bound of the generalization error. Experimentally, IMPaCT achieves a 3.8% performance improvement over current SOTA method on the ogbn-mag graph dataset. Additionally, we introduce the Temporal Stochastic Block Model (TSBM), which replicates temporal graphs under varying conditions, demonstrating the applicability of our methods to general spatial GNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10957v1-abstract-full').style.display = 'none'; document.getElementById('2411.10957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages (without appendix), 35 pages (with appendix), 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10927">arXiv:2411.10927</a> <span> [<a href="https://arxiv.org/pdf/2411.10927">pdf</a>, <a href="https://arxiv.org/format/2411.10927">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Inter-linguistic Phonetic Composition (IPC): A Theoretical and Computational Approach to Enhance Second Language Pronunciation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jisang Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minu Kim</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+D">DaYoung Hong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jongha Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10927v1-abstract-short" style="display: inline;"> Learners of a second language (L2) often unconsciously substitute unfamiliar L2 phonemes with similar phonemes from their native language (L1), even though native speakers of the L2 perceive these sounds as distinct and non-interchangeable. This phonemic substitution leads to deviations from the standard phonological patterns of the L2, creating challenges for learners in acquiring accurate L2 pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10927v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10927v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10927v1-abstract-full" style="display: none;"> Learners of a second language (L2) often unconsciously substitute unfamiliar L2 phonemes with similar phonemes from their native language (L1), even though native speakers of the L2 perceive these sounds as distinct and non-interchangeable. This phonemic substitution leads to deviations from the standard phonological patterns of the L2, creating challenges for learners in acquiring accurate L2 pronunciation. To address this, we propose Inter-linguistic Phonetic Composition (IPC), a novel computational method designed to minimize incorrect phonological transfer by reconstructing L2 phonemes as composite sounds derived from multiple L1 phonemes. Tests with two automatic speech recognition models demonstrated that when L2 speakers produced IPC-generated composite sounds, the recognition rate of target L2 phonemes improved by 20% compared to when their pronunciation was influenced by original phonological transfer patterns. The improvement was observed within a relatively shorter time frame, demonstrating rapid acquisition of the composite sound. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10927v1-abstract-full').style.display = 'none'; document.getElementById('2411.10927v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 Figures, submitted to ACL ARR October 2024 for NAACL 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10814">arXiv:2411.10814</a> <span> [<a href="https://arxiv.org/pdf/2411.10814">pdf</a>, <a href="https://arxiv.org/ps/2411.10814">ps</a>, <a href="https://arxiv.org/format/2411.10814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DEAL: Decoupled Classifier with Adaptive Linear Modulation for Group Robust Early Diagnosis of MCI to AD Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Donggyu Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Juhyeon Park</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+T">Taesup Moon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10814v1-abstract-short" style="display: inline;"> While deep learning-based Alzheimer's disease (AD) diagnosis has recently made significant advancements, particularly in predicting the conversion of mild cognitive impairment (MCI) to AD based on MRI images, there remains a critical gap in research regarding the group robustness of the diagnosis. Although numerous studies pointed out that deep learning-based classifiers may exhibit poor performan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10814v1-abstract-full" style="display: none;"> While deep learning-based Alzheimer's disease (AD) diagnosis has recently made significant advancements, particularly in predicting the conversion of mild cognitive impairment (MCI) to AD based on MRI images, there remains a critical gap in research regarding the group robustness of the diagnosis. Although numerous studies pointed out that deep learning-based classifiers may exhibit poor performance in certain groups by relying on unimportant attributes, this issue has been largely overlooked in the early diagnosis of MCI to AD conversion. In this paper, we present the first comprehensive investigation of the group robustness in the early diagnosis of MCI to AD conversion using MRI images, focusing on disparities in accuracy between groups, specifically sMCI and pMCI individuals divided by age. Our experiments reveal that standard classifiers consistently underperform for certain groups across different architectures, highlighting the need for more tailored approaches. To address this, we propose a novel method, dubbed DEAL (DEcoupled classifier with Adaptive Linear modulation), comprising two key components: (1) a linear modulation of features from the penultimate layer, incorporating easily obtainable age and cognitive indicative tabular features, and (2) a decoupled classifier that provides more tailored decision boundaries for each group, further improving performance. Through extensive experiments and evaluations across different architectures, we demonstrate the efficacy of DEAL in improving the group robustness of the MCI to AD conversion prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10814v1-abstract-full').style.display = 'none'; document.getElementById('2411.10814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10109">arXiv:2411.10109</a> <span> [<a href="https://arxiv.org/pdf/2411.10109">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative Agent Simulations of 1,000 People </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J+S">Joon Sung Park</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+C+Q">Carolyn Q. Zou</a>, <a href="/search/cs?searchtype=author&query=Shaw%2C+A">Aaron Shaw</a>, <a href="/search/cs?searchtype=author&query=Hill%2C+B+M">Benjamin Mako Hill</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Carrie Cai</a>, <a href="/search/cs?searchtype=author&query=Morris%2C+M+R">Meredith Ringel Morris</a>, <a href="/search/cs?searchtype=author&query=Willer%2C+R">Robb Willer</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+P">Percy Liang</a>, <a href="/search/cs?searchtype=author&query=Bernstein%2C+M+S">Michael S. Bernstein</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10109v1-abstract-short" style="display: inline;"> The promise of human behavioral simulation--general-purpose computational agents that replicate human behavior across domains--could enable broad applications in policymaking and social science. We present a novel agent architecture that simulates the attitudes and behaviors of 1,052 real individuals--applying large language models to qualitative interviews about their lives, then measuring how we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10109v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10109v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10109v1-abstract-full" style="display: none;"> The promise of human behavioral simulation--general-purpose computational agents that replicate human behavior across domains--could enable broad applications in policymaking and social science. We present a novel agent architecture that simulates the attitudes and behaviors of 1,052 real individuals--applying large language models to qualitative interviews about their lives, then measuring how well these agents replicate the attitudes and behaviors of the individuals that they represent. The generative agents replicate participants' responses on the General Social Survey 85% as accurately as participants replicate their own answers two weeks later, and perform comparably in predicting personality traits and outcomes in experimental replications. Our architecture reduces accuracy biases across racial and ideological groups compared to agents given demographic descriptions. This work provides a foundation for new tools that can help investigate individual and collective behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10109v1-abstract-full').style.display = 'none'; document.getElementById('2411.10109v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09909">arXiv:2411.09909</a> <span> [<a href="https://arxiv.org/pdf/2411.09909">pdf</a>, <a href="https://arxiv.org/format/2411.09909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AMXFP4: Taming Activation Outliers with Asymmetric Microscaling Floating-Point for 4-bit LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Janghwan Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jiwoong Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinseok Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yongjik Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jungju Oh</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+J">Jinwook Oh</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jungwook Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09909v1-abstract-short" style="display: inline;"> Scaling Large Language Models (LLMs) with extended context lengths has increased the need for efficient low-bit quantization to manage their substantial computational demands. However, reducing precision to 4 bits frequently degrades performance due to activation outliers. To address this, we propose Asymmetric Microscaling 4-bit Floating-Point (AMXFP4) for efficient LLM inference. This novel data… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09909v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09909v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09909v1-abstract-full" style="display: none;"> Scaling Large Language Models (LLMs) with extended context lengths has increased the need for efficient low-bit quantization to manage their substantial computational demands. However, reducing precision to 4 bits frequently degrades performance due to activation outliers. To address this, we propose Asymmetric Microscaling 4-bit Floating-Point (AMXFP4) for efficient LLM inference. This novel data format leverages asymmetric shared scales to mitigate outliers while naturally capturing the asymmetry introduced by group-wise quantization. Unlike conventional 4-bit quantization methods that rely on data rotation and costly calibration, AMXFP4 uses asymmetric shared scales for direct 4-bit casting, achieving near-ideal quantization accuracy across various LLM tasks, including multi-turn conversations, long-context reasoning, and visual question answering. Our AMXFP4 format significantly outperforms MXFP4 and other leading quantization techniques, enabling robust, calibration-free 4-bit inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09909v1-abstract-full').style.display = 'none'; document.getElementById('2411.09909v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09473">arXiv:2411.09473</a> <span> [<a href="https://arxiv.org/pdf/2411.09473">pdf</a>, <a href="https://arxiv.org/format/2411.09473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Scalability and Performance in Influence Maximization with Optimized Parallel Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hanjiang Wu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Huan Xu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Joongun Park</a>, <a href="/search/cs?searchtype=author&query=Tithi%2C+J+J">Jesmin Jahan Tithi</a>, <a href="/search/cs?searchtype=author&query=Checconi%2C+F">Fabio Checconi</a>, <a href="/search/cs?searchtype=author&query=Wolfson-Pou%2C+J">Jordi Wolfson-Pou</a>, <a href="/search/cs?searchtype=author&query=Petrini%2C+F">Fabrizio Petrini</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+T">Tushar Krishna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09473v1-abstract-short" style="display: inline;"> Influence Maximization (IM) is vital in viral marketing and biological network analysis for identifying key influencers. Given its NP-hard nature, approximate solutions are employed. This paper addresses scalability challenges in scale-out shared memory system by focusing on the state-of-the-art Influence Maximization via Martingales (IMM) benchmark. To enhance the work efficiency of the current I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09473v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09473v1-abstract-full" style="display: none;"> Influence Maximization (IM) is vital in viral marketing and biological network analysis for identifying key influencers. Given its NP-hard nature, approximate solutions are employed. This paper addresses scalability challenges in scale-out shared memory system by focusing on the state-of-the-art Influence Maximization via Martingales (IMM) benchmark. To enhance the work efficiency of the current IMM implementation, we propose EFFICIENTIMM with key strategies, including new parallelization scheme, NUMA-aware memory usage, dynamic load balancing and fine-grained adaptive data structures. Benchmarking on a 128-core CPU system with 8 NUMA nodes, EFFICIENTIMM demonstrated significant performance improvements, achieving an average 5.9x speedup over Ripples across 8 diverse SNAP datasets, when compared to the best execution times of the original Ripples framework. Additionally, on the Youtube graph, EFFICIENTIMM demonstrates a better memory access pattern with 357.4x reduction in L1+L2 cache misses as compared to Ripples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09473v1-abstract-full').style.display = 'none'; document.getElementById('2411.09473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09400">arXiv:2411.09400</a> <span> [<a href="https://arxiv.org/pdf/2411.09400">pdf</a>, <a href="https://arxiv.org/format/2411.09400">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Imagined Speech and Visual Imagery as Intuitive Paradigms for Brain-Computer Interfaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seo-Hyun Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Ji-Ha Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Deok-Seon Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09400v1-abstract-short" style="display: inline;"> Recent advancements in brain-computer interface (BCI) technology have emphasized the promise of imagined speech and visual imagery as effective paradigms for intuitive communication. This study investigates the classification performance and brain connectivity patterns associated with these paradigms, focusing on decoding accuracy across selected word classes. Sixteen participants engaged in tasks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09400v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09400v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09400v1-abstract-full" style="display: none;"> Recent advancements in brain-computer interface (BCI) technology have emphasized the promise of imagined speech and visual imagery as effective paradigms for intuitive communication. This study investigates the classification performance and brain connectivity patterns associated with these paradigms, focusing on decoding accuracy across selected word classes. Sixteen participants engaged in tasks involving thirteen imagined speech and visual imagery classes, revealing above-chance classification accuracy for both paradigms. Variability in classification accuracy across individual classes highlights the influence of sensory and motor associations in imagined speech and vivid visual associations in visual imagery. Connectivity analysis further demonstrated increased functional connectivity in language-related and sensory regions for imagined speech, whereas visual imagery activated spatial and visual processing networks. These findings suggest the potential of imagined speech and visual imagery as an intuitive and scalable paradigm for BCI communication when selecting optimal word classes. Further exploration of the decoding outcomes for these two paradigms could provide insights for practical BCI communication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09400v1-abstract-full').style.display = 'none'; document.getElementById('2411.09400v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09211">arXiv:2411.09211</a> <span> [<a href="https://arxiv.org/pdf/2411.09211">pdf</a>, <a href="https://arxiv.org/format/2411.09211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Neural Communication: Convergence of Computer Vision and Brain-Computer Interface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Ji-Ha Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seo-Hyun Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Soowon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seong-Whan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09211v1-abstract-short" style="display: inline;"> Interpreting human neural signals to decode static speech intentions such as text or images and dynamic speech intentions such as audio or video is showing great potential as an innovative communication tool. Human communication accompanies various features, such as articulatory movements, facial expressions, and internal speech, all of which are reflected in neural signals. However, most studies… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09211v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09211v1-abstract-full" style="display: none;"> Interpreting human neural signals to decode static speech intentions such as text or images and dynamic speech intentions such as audio or video is showing great potential as an innovative communication tool. Human communication accompanies various features, such as articulatory movements, facial expressions, and internal speech, all of which are reflected in neural signals. However, most studies only generate short or fragmented outputs, while providing informative communication by leveraging various features from neural signals remains challenging. In this study, we introduce a dynamic neural communication method that leverages current computer vision and brain-computer interface technologies. Our approach captures the user's intentions from neural signals and decodes visemes in short time steps to produce dynamic visual outputs. The results demonstrate the potential to rapidly capture and reconstruct lip movements during natural speech attempts from human neural signals, enabling dynamic neural communication through the convergence of computer vision and brain--computer interface. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09211v1-abstract-full').style.display = 'none'; document.getElementById('2411.09211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures, 1 table, Name of Conference: International Conference on Brain-Computer Interface</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07461">arXiv:2411.07461</a> <span> [<a href="https://arxiv.org/pdf/2411.07461">pdf</a>, <a href="https://arxiv.org/format/2411.07461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Awadalla%2C+A">Anas Awadalla</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Le Xue</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+M">Manli Shu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+A">An Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Purushwalkam%2C+S">Senthil Purushwalkam</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+S">Sheng Shen</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hannah Lee</a>, <a href="/search/cs?searchtype=author&query=Lo%2C+O">Oscar Lo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J+S">Jae Sung Park</a>, <a href="/search/cs?searchtype=author&query=Guha%2C+E">Etash Guha</a>, <a href="/search/cs?searchtype=author&query=Savarese%2C+S">Silvio Savarese</a>, <a href="/search/cs?searchtype=author&query=Schmidt%2C+L">Ludwig Schmidt</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ran Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07461v1-abstract-short" style="display: inline;"> We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that bridges the gap between descriptive synthetic captions and factual web-scale alt-text. KALE augments synthetic dense image captions with web-scale alt-text to generate factually grounded image captions. Our two-stage approach leverages large vision-language models and language models to create knowledge-augmented captions, whi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07461v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07461v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07461v1-abstract-full" style="display: none;"> We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that bridges the gap between descriptive synthetic captions and factual web-scale alt-text. KALE augments synthetic dense image captions with web-scale alt-text to generate factually grounded image captions. Our two-stage approach leverages large vision-language models and language models to create knowledge-augmented captions, which are then used to train a specialized VLM for scaling up the dataset. We train vision-language models on KALE and demonstrate improvements on vision-language tasks. Our experiments show the utility of KALE for training more capable and knowledgeable multimodal models. We release the KALE dataset at https://huggingface.co/datasets/Salesforce/blip3-kale <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07461v1-abstract-full').style.display = 'none'; document.getElementById('2411.07461v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06736">arXiv:2411.06736</a> <span> [<a href="https://arxiv.org/pdf/2411.06736">pdf</a>, <a href="https://arxiv.org/format/2411.06736">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mr.Steve: Instruction-Following Agents in Minecraft with What-Where-When Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Junyeong Park</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Junmo Cho</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+S">Sungjin Ahn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06736v2-abstract-short" style="display: inline;"> Significant advances have been made in developing general-purpose embodied AI in environments like Minecraft through the adoption of LLM-augmented hierarchical approaches. While these approaches, which combine high-level planners with low-level controllers, show promise, low-level controllers frequently become performance bottlenecks due to repeated failures. In this paper, we argue that the prima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06736v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06736v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06736v2-abstract-full" style="display: none;"> Significant advances have been made in developing general-purpose embodied AI in environments like Minecraft through the adoption of LLM-augmented hierarchical approaches. While these approaches, which combine high-level planners with low-level controllers, show promise, low-level controllers frequently become performance bottlenecks due to repeated failures. In this paper, we argue that the primary cause of failure in many low-level controllers is the absence of an episodic memory system. To address this, we introduce Mr. Steve (Memory Recall Steve-1), a novel low-level controller equipped with Place Event Memory (PEM), a form of episodic memory that captures what, where, and when information from episodes. This directly addresses the main limitation of the popular low-level controller, Steve-1. Unlike previous models that rely on short-term memory, PEM organizes spatial and event-based data, enabling efficient recall and navigation in long-horizon tasks. Additionally, we propose an Exploration Strategy and a Memory-Augmented Task Solving Framework, allowing agents to alternate between exploration and task-solving based on recalled events. Our approach significantly improves task-solving and exploration efficiency compared to existing methods. We will release our code and demos on the project page: https://sites.google.com/view/mr-steve. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06736v2-abstract-full').style.display = 'none'; document.getElementById('2411.06736v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06367">arXiv:2411.06367</a> <span> [<a href="https://arxiv.org/pdf/2411.06367">pdf</a>, <a href="https://arxiv.org/format/2411.06367">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> BayesNAM: Leveraging Inconsistency for Reliable Explanations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hoki Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinseong Park</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yujin Choi</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seungyun Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jaewook Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06367v1-abstract-short" style="display: inline;"> Neural additive model (NAM) is a recently proposed explainable artificial intelligence (XAI) method that utilizes neural network-based architectures. Given the advantages of neural networks, NAMs provide intuitive explanations for their predictions with high model performance. In this paper, we analyze a critical yet overlooked phenomenon: NAMs often produce inconsistent explanations, even when us… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06367v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06367v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06367v1-abstract-full" style="display: none;"> Neural additive model (NAM) is a recently proposed explainable artificial intelligence (XAI) method that utilizes neural network-based architectures. Given the advantages of neural networks, NAMs provide intuitive explanations for their predictions with high model performance. In this paper, we analyze a critical yet overlooked phenomenon: NAMs often produce inconsistent explanations, even when using the same architecture and dataset. Traditionally, such inconsistencies have been viewed as issues to be resolved. However, we argue instead that these inconsistencies can provide valuable explanations within the given data model. Through a simple theoretical framework, we demonstrate that these inconsistencies are not mere artifacts but emerge naturally in datasets with multiple important features. To effectively leverage this information, we introduce a novel framework, Bayesian Neural Additive Model (BayesNAM), which integrates Bayesian neural networks and feature dropout, with theoretical proof demonstrating that feature dropout effectively captures model inconsistencies. Our experiments demonstrate that BayesNAM effectively reveals potential problems such as insufficient data or structural limitations of the model, providing more reliable explanations and potential remedies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06367v1-abstract-full').style.display = 'none'; document.getElementById('2411.06367v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06311">arXiv:2411.06311</a> <span> [<a href="https://arxiv.org/pdf/2411.06311">pdf</a>, <a href="https://arxiv.org/format/2411.06311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Mathematical Physics">math-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> When are dynamical systems learned from time series data statistically accurate? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jeongjin Park</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+N">Nicole Yang</a>, <a href="/search/cs?searchtype=author&query=Chandramoorthy%2C+N">Nisha Chandramoorthy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06311v1-abstract-short" style="display: inline;"> Conventional notions of generalization often fail to describe the ability of learned models to capture meaningful information from dynamical data. A neural network that learns complex dynamics with a small test error may still fail to reproduce its \emph{physical} behavior, including associated statistical moments and Lyapunov exponents. To address this gap, we propose an ergodic theoretic approac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06311v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06311v1-abstract-full" style="display: none;"> Conventional notions of generalization often fail to describe the ability of learned models to capture meaningful information from dynamical data. A neural network that learns complex dynamics with a small test error may still fail to reproduce its \emph{physical} behavior, including associated statistical moments and Lyapunov exponents. To address this gap, we propose an ergodic theoretic approach to generalization of complex dynamical models learned from time series data. Our main contribution is to define and analyze generalization of a broad suite of neural representations of classes of ergodic systems, including chaotic systems, in a way that captures emulating underlying invariant, physical measures. Our results provide theoretical justification for why regression methods for generators of dynamical systems (Neural ODEs) fail to generalize, and why their statistical accuracy improves upon adding Jacobian information during training. We verify our results on a number of ergodic chaotic systems and neural network parameterizations, including MLPs, ResNets, Fourier Neural layers, and RNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06311v1-abstract-full').style.display = 'none'; document.getElementById('2411.06311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">in NeuRIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05423">arXiv:2411.05423</a> <span> [<a href="https://arxiv.org/pdf/2411.05423">pdf</a>, <a href="https://arxiv.org/format/2411.05423">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VISTA: Visual Integrated System for Tailored Automation in Math Problem Generation Using LLM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jeongwoo Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Kwangsuk Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jihyeon Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05423v1-abstract-short" style="display: inline;"> Generating accurate and consistent visual aids is a critical challenge in mathematics education, where visual representations like geometric shapes and functions play a pivotal role in enhancing student comprehension. This paper introduces a novel multi-agent framework that leverages Large Language Models (LLMs) to automate the creation of complex mathematical visualizations alongside coherent pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05423v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05423v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05423v1-abstract-full" style="display: none;"> Generating accurate and consistent visual aids is a critical challenge in mathematics education, where visual representations like geometric shapes and functions play a pivotal role in enhancing student comprehension. This paper introduces a novel multi-agent framework that leverages Large Language Models (LLMs) to automate the creation of complex mathematical visualizations alongside coherent problem text. Our approach not only simplifies the generation of precise visual aids but also aligns these aids with the problem's core mathematical concepts, improving both problem creation and assessment. By integrating multiple agents, each responsible for distinct tasks such as numeric calculation, geometry validation, and visualization, our system delivers mathematically accurate and contextually relevant problems with visual aids. Evaluation across Geometry and Function problem types shows that our method significantly outperforms basic LLMs in terms of text coherence, consistency, relevance and similarity, while maintaining the essential geometrical and functional integrity of the original problems. Although some challenges remain in ensuring consistent visual outputs, our framework demonstrates the immense potential of LLMs in transforming the way educators generate and utilize visual aids in math education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05423v1-abstract-full').style.display = 'none'; document.getElementById('2411.05423v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024 Workshop on Large Foundation Models for Educational Assessment (FM-Assess)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05330">arXiv:2411.05330</a> <span> [<a href="https://arxiv.org/pdf/2411.05330">pdf</a>, <a href="https://arxiv.org/format/2411.05330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Inversion-based Latent Bayesian Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+J">Jaewon Chu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinyoung Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seunghun Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05330v1-abstract-short" style="display: inline;"> Latent Bayesian optimization (LBO) approaches have successfully adopted Bayesian optimization over a continuous latent space by employing an encoder-decoder architecture to address the challenge of optimization in a high dimensional or discrete input space. LBO learns a surrogate model to approximate the black-box objective function in the latent space. However, we observed that most LBO methods s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05330v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05330v1-abstract-full" style="display: none;"> Latent Bayesian optimization (LBO) approaches have successfully adopted Bayesian optimization over a continuous latent space by employing an encoder-decoder architecture to address the challenge of optimization in a high dimensional or discrete input space. LBO learns a surrogate model to approximate the black-box objective function in the latent space. However, we observed that most LBO methods suffer from the `misalignment problem`, which is induced by the reconstruction error of the encoder-decoder architecture. It hinders learning an accurate surrogate model and generating high-quality solutions. In addition, several trust region-based LBO methods select the anchor, the center of the trust region, based solely on the objective function value without considering the trust region`s potential to enhance the optimization process. To address these issues, we propose Inversion-based Latent Bayesian Optimization (InvBO), a plug-and-play module for LBO. InvBO consists of two components: an inversion method and a potential-aware trust region anchor selection. The inversion method searches the latent code that completely reconstructs the given target data. The potential-aware trust region anchor selection considers the potential capability of the trust region for better local optimization. Experimental results demonstrate the effectiveness of InvBO on nine real-world benchmarks, such as molecule design and arithmetic expression fitting tasks. Code is available at https://github.com/mlvlab/InvBO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05330v1-abstract-full').style.display = 'none'; document.getElementById('2411.05330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05254">arXiv:2411.05254</a> <span> [<a href="https://arxiv.org/pdf/2411.05254">pdf</a>, <a href="https://arxiv.org/format/2411.05254">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Visual Feature Aggregation for OCR-Free Document Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaeyoo Park</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J+Y">Jin Young Choi</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jeonghyung Park</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bohyung Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05254v1-abstract-short" style="display: inline;"> We present a novel OCR-free document understanding framework based on pretrained Multimodal Large Language Models (MLLMs). Our approach employs multi-scale visual features to effectively handle various font sizes within document images. To address the increasing costs of considering the multi-scale visual inputs for MLLMs, we propose the Hierarchical Visual Feature Aggregation (HVFA) module, desig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05254v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05254v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05254v1-abstract-full" style="display: none;"> We present a novel OCR-free document understanding framework based on pretrained Multimodal Large Language Models (MLLMs). Our approach employs multi-scale visual features to effectively handle various font sizes within document images. To address the increasing costs of considering the multi-scale visual inputs for MLLMs, we propose the Hierarchical Visual Feature Aggregation (HVFA) module, designed to reduce the number of input tokens to LLMs. Leveraging a feature pyramid with cross-attentive pooling, our approach effectively manages the trade-off between information loss and efficiency without being affected by varying document image sizes. Furthermore, we introduce a novel instruction tuning task, which facilitates the model's text-reading capability by learning to predict the relative positions of input text, eventually minimizing the risk of truncated text caused by the limited capacity of LLMs. Comprehensive experiments validate the effectiveness of our approach, demonstrating superior performance in various document understanding tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05254v1-abstract-full').style.display = 'none'; document.getElementById('2411.05254v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05001">arXiv:2411.05001</a> <span> [<a href="https://arxiv.org/pdf/2411.05001">pdf</a>, <a href="https://arxiv.org/format/2411.05001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Analyzing The Language of Visual Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chan%2C+D+M">David M. Chan</a>, <a href="/search/cs?searchtype=author&query=Corona%2C+R">Rodolfo Corona</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Joonyong Park</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+C+J">Cheol Jun Cho</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yutong Bai</a>, <a href="/search/cs?searchtype=author&query=Darrell%2C+T">Trevor Darrell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05001v1-abstract-short" style="display: inline;"> With the introduction of transformer-based models for vision and language tasks, such as LLaVA and Chameleon, there has been renewed interest in the discrete tokenized representation of images. These models often treat image patches as discrete tokens, analogous to words in natural language, learning joint alignments between visual and human languages. However, little is known about the statistica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05001v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05001v1-abstract-full" style="display: none;"> With the introduction of transformer-based models for vision and language tasks, such as LLaVA and Chameleon, there has been renewed interest in the discrete tokenized representation of images. These models often treat image patches as discrete tokens, analogous to words in natural language, learning joint alignments between visual and human languages. However, little is known about the statistical behavior of these visual languages - whether they follow similar frequency distributions, grammatical structures, or topologies as natural languages. In this paper, we take a natural-language-centric approach to analyzing discrete visual languages and uncover striking similarities and fundamental differences. We demonstrate that, although visual languages adhere to Zipfian distributions, higher token innovation drives greater entropy and lower compression, with tokens predominantly representing object parts, indicating intermediate granularity. We also show that visual languages lack cohesive grammatical structures, leading to higher perplexity and weaker hierarchical organization compared to natural languages. Finally, we demonstrate that, while vision models align more closely with natural languages than other models, this alignment remains significantly weaker than the cohesion found within natural languages. Through these experiments, we demonstrate how understanding the statistical properties of discrete visual languages can inform the design of more effective computer vision models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05001v1-abstract-full').style.display = 'none'; document.getElementById('2411.05001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04225">arXiv:2411.04225</a> <span> [<a href="https://arxiv.org/pdf/2411.04225">pdf</a>, <a href="https://arxiv.org/format/2411.04225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Approximate Equivariance in Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J+Y">Jung Yeon Park</a>, <a href="/search/cs?searchtype=author&query=Bhatt%2C+S">Sujay Bhatt</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Sihan Zeng</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+L+L+S">Lawson L. S. Wong</a>, <a href="/search/cs?searchtype=author&query=Koppel%2C+A">Alec Koppel</a>, <a href="/search/cs?searchtype=author&query=Ganesh%2C+S">Sumitra Ganesh</a>, <a href="/search/cs?searchtype=author&query=Walters%2C+R">Robin Walters</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04225v1-abstract-short" style="display: inline;"> Equivariant neural networks have shown great success in reinforcement learning, improving sample efficiency and generalization when there is symmetry in the task. However, in many problems, only approximate symmetry is present, which makes imposing exact symmetry inappropriate. Recently, approximately equivariant networks have been proposed for supervised classification and modeling physical syste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04225v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04225v1-abstract-full" style="display: none;"> Equivariant neural networks have shown great success in reinforcement learning, improving sample efficiency and generalization when there is symmetry in the task. However, in many problems, only approximate symmetry is present, which makes imposing exact symmetry inappropriate. Recently, approximately equivariant networks have been proposed for supervised classification and modeling physical systems. In this work, we develop approximately equivariant algorithms in reinforcement learning (RL). We define approximately equivariant MDPs and theoretically characterize the effect of approximate equivariance on the optimal Q function. We propose novel RL architectures using relaxed group convolutions and experiment on several continuous control domains and stock trading with real financial data. Our results demonstrate that approximate equivariance matches prior work when exact symmetries are present, and outperforms them when domains exhibit approximate symmetry. As an added byproduct of these techniques, we observe increased robustness to noise at test time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04225v1-abstract-full').style.display = 'none'; document.getElementById('2411.04225v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04125">arXiv:2411.04125</a> <span> [<a href="https://arxiv.org/pdf/2411.04125">pdf</a>, <a href="https://arxiv.org/format/2411.04125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Community Forensics: Using Thousands of Generators to Train Fake Image Detectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jeongsoo Park</a>, <a href="/search/cs?searchtype=author&query=Owens%2C+A">Andrew Owens</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04125v1-abstract-short" style="display: inline;"> One of the key challenges of detecting AI-generated images is spotting images that have been created by previously unseen generative models. We argue that the limited diversity of the training data is a major obstacle to addressing this problem, and we propose a new dataset that is significantly larger and more diverse than prior work. As part of creating this dataset, we systematically download t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04125v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04125v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04125v1-abstract-full" style="display: none;"> One of the key challenges of detecting AI-generated images is spotting images that have been created by previously unseen generative models. We argue that the limited diversity of the training data is a major obstacle to addressing this problem, and we propose a new dataset that is significantly larger and more diverse than prior work. As part of creating this dataset, we systematically download thousands of text-to-image latent diffusion models and sample images from them. We also collect images from dozens of popular open source and commercial models. The resulting dataset contains 2.7M images that have been sampled from 4803 different models. These images collectively capture a wide range of scene content, generator architectures, and image processing settings. Using this dataset, we study the generalization abilities of fake image detectors. Our experiments suggest that detection performance improves as the number of models in the training set increases, even when these models have similar architectures. We also find that detection performance improves as the diversity of the models increases, and that our trained detectors generalize better than those trained on other datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04125v1-abstract-full').style.display = 'none'; document.getElementById('2411.04125v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03393">arXiv:2411.03393</a> <span> [<a href="https://arxiv.org/pdf/2411.03393">pdf</a>, <a href="https://arxiv.org/format/2411.03393">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> A refined graph container lemma and applications to the hard-core model on bipartite expanders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jenssen%2C+M">Matthew Jenssen</a>, <a href="/search/cs?searchtype=author&query=Malekshahian%2C+A">Alexandru Malekshahian</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinyoung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03393v1-abstract-short" style="display: inline;"> We establish a refined version of a graph container lemma due to Galvin and discuss several applications related to the hard-core model on bipartite expander graphs. Given a graph $G$ and $位>0$, the hard-core model on $G$ at activity $位$ is the probability distribution $渭_{G,位}$ on independent sets in $G$ given by $渭_{G,位}(I)\propto 位^{|I|}$. As one of our main applications, we show that the hard-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03393v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03393v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03393v1-abstract-full" style="display: none;"> We establish a refined version of a graph container lemma due to Galvin and discuss several applications related to the hard-core model on bipartite expander graphs. Given a graph $G$ and $位>0$, the hard-core model on $G$ at activity $位$ is the probability distribution $渭_{G,位}$ on independent sets in $G$ given by $渭_{G,位}(I)\propto 位^{|I|}$. As one of our main applications, we show that the hard-core model at activity $位$ on the hypercube $Q_d$ exhibits a `structured phase' for $位= 惟( \log^2 d/d^{1/2})$ in the following sense: in a typical sample from $渭_{Q_d,位}$, most vertices are contained in one side of the bipartition of $Q_d$. This improves upon a result of Galvin which establishes the same for $位=惟(\log d/ d^{1/3})$. As another application, we establish a fully polynomial-time approximation scheme (FPTAS) for the hard-core model on a $d$-regular bipartite $伪$-expander, with $伪>0$ fixed, when $位= 惟( \log^2 d/d^{1/2})$. This improves upon the bound $位=惟(\log d/ d^{1/4})$ due to the first author, Perkins and Potukuchi. We discuss similar improvements to results of Galvin-Tetali, Balogh-Garcia-Li and Kronenberg-Spinka. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03393v1-abstract-full').style.display = 'none'; document.getElementById('2411.03393v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02814">arXiv:2411.02814</a> <span> [<a href="https://arxiv.org/pdf/2411.02814">pdf</a>, <a href="https://arxiv.org/format/2411.02814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Operating Systems">cs.OS</span> </div> </div> <p class="title is-5 mathjax"> The Hitchhiker's Guide to Programming and Optimizing CXL-Based Heterogeneous Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Mahar%2C+S">Suyash Mahar</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Luyi Li</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jangseon Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinpyo Kim</a>, <a href="/search/cs?searchtype=author&query=Michailidis%2C+T">Theodore Michailidis</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yue Pan</a>, <a href="/search/cs?searchtype=author&query=Rosing%2C+T">Tajana Rosing</a>, <a href="/search/cs?searchtype=author&query=Tullsen%2C+D">Dean Tullsen</a>, <a href="/search/cs?searchtype=author&query=Swanson%2C+S">Steven Swanson</a>, <a href="/search/cs?searchtype=author&query=Ryoo%2C+K+C">Kyung Chang Ryoo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sungjoo Park</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jishen Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02814v1-abstract-short" style="display: inline;"> We present a thorough analysis of the use of CXL-based heterogeneous systems. We built a cluster of server systems that combines different vendor's CPUs and various types of CXL devices. We further developed a heterogeneous memory benchmark suite, Heimdall, to profile the performance of such heterogeneous systems. By leveraging Heimdall, we unveiled the detailed architecture design in these system… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02814v1-abstract-full" style="display: none;"> We present a thorough analysis of the use of CXL-based heterogeneous systems. We built a cluster of server systems that combines different vendor's CPUs and various types of CXL devices. We further developed a heterogeneous memory benchmark suite, Heimdall, to profile the performance of such heterogeneous systems. By leveraging Heimdall, we unveiled the detailed architecture design in these systems, drew observations on optimizing performance for workloads, and pointed out directions for future development of CXL-based heterogeneous systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02814v1-abstract-full').style.display = 'none'; document.getElementById('2411.02814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02403">arXiv:2411.02403</a> <span> [<a href="https://arxiv.org/pdf/2411.02403">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Persuasion-Based Prompt Learning Approach to Improve Smishing Detection through Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shim%2C+H+S">Ho Sung Shim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H">Hyoungjun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kyuhan Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jang-Sun Park</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+S">Seonhye Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02403v2-abstract-short" style="display: inline;"> Smishing, which aims to illicitly obtain personal information from unsuspecting victims, holds significance due to its negative impacts on our society. In prior studies, as a tool to counteract smishing, machine learning (ML) has been widely adopted, which filters and blocks smishing messages before they reach potential victims. However, a number of challenges remain in ML-based smishing detection… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02403v2-abstract-full').style.display = 'inline'; document.getElementById('2411.02403v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02403v2-abstract-full" style="display: none;"> Smishing, which aims to illicitly obtain personal information from unsuspecting victims, holds significance due to its negative impacts on our society. In prior studies, as a tool to counteract smishing, machine learning (ML) has been widely adopted, which filters and blocks smishing messages before they reach potential victims. However, a number of challenges remain in ML-based smishing detection, with the scarcity of annotated datasets being one major hurdle. Specifically, given the sensitive nature of smishing-related data, there is a lack of publicly accessible data that can be used for training and evaluating ML models. Additionally, the nuanced similarities between smishing messages and other types of social engineering attacks such as spam messages exacerbate the challenge of smishing classification with limited resources. To tackle this challenge, we introduce a novel data augmentation method utilizing a few-shot prompt learning approach. What sets our approach apart from extant methods is the use of the principles of persuasion, a psychology theory which explains the underlying mechanisms of smishing. By designing prompts grounded in the persuasion principles, our augmented dataset could effectively capture various, important aspects of smishing messages, enabling ML models to be effectively trained. Our evaluation within a real-world context demonstrates that our augmentation approach produces more diverse and higher-quality smishing data instances compared to other cutting-edging approaches, leading to substantial improvements in the ability of ML models to detect the subtle characteristics of smishing messages. Moreover, our additional analyses reveal that the performance improvement provided by our approach is more pronounced when used with ML models that have a larger number of parameters, demonstrating its effectiveness in training large-scale ML models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02403v2-abstract-full').style.display = 'none'; document.getElementById('2411.02403v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02114">arXiv:2411.02114</a> <span> [<a href="https://arxiv.org/pdf/2411.02114">pdf</a>, <a href="https://arxiv.org/format/2411.02114">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Semiparametric conformal prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J+W">Ji Won Park</a>, <a href="/search/cs?searchtype=author&query=Tibshirani%2C+R">Robert Tibshirani</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+K">Kyunghyun Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02114v1-abstract-short" style="display: inline;"> Many risk-sensitive applications require well-calibrated prediction sets over multiple, potentially correlated target variables, for which the prediction algorithm may report correlated non-conformity scores. In this work, we treat the scores as random vectors and aim to construct the prediction set accounting for their joint correlation structure. Drawing from the rich literature on multivariate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02114v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02114v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02114v1-abstract-full" style="display: none;"> Many risk-sensitive applications require well-calibrated prediction sets over multiple, potentially correlated target variables, for which the prediction algorithm may report correlated non-conformity scores. In this work, we treat the scores as random vectors and aim to construct the prediction set accounting for their joint correlation structure. Drawing from the rich literature on multivariate quantiles and semiparametric statistics, we propose an algorithm to estimate the $1-伪$ quantile of the scores, where $伪$ is the user-specified miscoverage rate. In particular, we flexibly estimate the joint cumulative distribution function (CDF) of the scores using nonparametric vine copulas and improve the asymptotic efficiency of the quantile estimate using its influence function. The vine decomposition allows our method to scale well to a large number of targets. We report desired coverage and competitive efficiency on a range of real-world regression problems, including those with missing-at-random labels in the calibration set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02114v1-abstract-full').style.display = 'none'; document.getElementById('2411.02114v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages (+12 appendix), 11 figures, submitted to AISTATS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01783">arXiv:2411.01783</a> <span> [<a href="https://arxiv.org/pdf/2411.01783">pdf</a>, <a href="https://arxiv.org/format/2411.01783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Context Parallelism for Scalable Million-Token Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+A">Amy Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingyi Yang</a>, <a href="/search/cs?searchtype=author&query=Ibrahim%2C+A">Aya Ibrahim</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xinfeng Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+B">Bangsheng Tang</a>, <a href="/search/cs?searchtype=author&query=Sizov%2C+G">Grigory Sizov</a>, <a href="/search/cs?searchtype=author&query=Reizenstein%2C+J">Jeremy Reizenstein</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jongsoo Park</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jianyu Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01783v2-abstract-short" style="display: inline;"> We present context parallelism for long-context large language model inference, which achieves near-linear scaling for long-context prefill latency with up to 128 H100 GPUs across 16 nodes. Particularly, our method achieves 1M context prefill with Llama3 405B model in 77s (93% parallelization efficiency, 63% FLOPS utilization) and 128K context prefill in 3.8s. We develop two lossless exact ring at… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01783v2-abstract-full').style.display = 'inline'; document.getElementById('2411.01783v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01783v2-abstract-full" style="display: none;"> We present context parallelism for long-context large language model inference, which achieves near-linear scaling for long-context prefill latency with up to 128 H100 GPUs across 16 nodes. Particularly, our method achieves 1M context prefill with Llama3 405B model in 77s (93% parallelization efficiency, 63% FLOPS utilization) and 128K context prefill in 3.8s. We develop two lossless exact ring attention variants: pass-KV and pass-Q to cover a wide range of use cases with the state-of-the-art performance: full prefill, persistent KV prefill and decode. Benchmarks on H100 GPU hosts inter-connected with RDMA and TCP both show similar scalability for long-context prefill, demonstrating that our method scales well using common commercial data center with medium-to-low inter-host bandwidth. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01783v2-abstract-full').style.display = 'none'; document.getElementById('2411.01783v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00871">arXiv:2411.00871</a> <span> [<a href="https://arxiv.org/pdf/2411.00871">pdf</a>, <a href="https://arxiv.org/format/2411.00871">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Molecular Networks">q-bio.MN</span> </div> </div> <p class="title is-5 mathjax"> LLaMo: Large Language Model-based Molecular Graph Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinyoung Park</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+M">Minseong Bae</a>, <a href="/search/cs?searchtype=author&query=Ko%2C+D">Dohwan Ko</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00871v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable generalization and instruction-following capabilities with instruction tuning. The advancements in LLMs and instruction tuning have led to the development of Large Vision-Language Models (LVLMs). However, the competency of the LLMs and instruction tuning have been less explored in the molecular domain. Thus, we propose LLaMo: Large Language… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00871v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00871v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00871v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable generalization and instruction-following capabilities with instruction tuning. The advancements in LLMs and instruction tuning have led to the development of Large Vision-Language Models (LVLMs). However, the competency of the LLMs and instruction tuning have been less explored in the molecular domain. Thus, we propose LLaMo: Large Language Model-based Molecular graph assistant, which is an end-to-end trained large molecular graph-language model. To bridge the discrepancy between the language and graph modalities, we present the multi-level graph projector that transforms graph representations into graph tokens by abstracting the output representations of each GNN layer and motif representations with the cross-attention mechanism. We also introduce machine-generated molecular graph instruction data to instruction-tune the large molecular graph-language model for general-purpose molecule and language understanding. Our extensive experiments demonstrate that LLaMo shows the best performance on diverse tasks, such as molecular description generation, property prediction, and IUPAC name prediction. The code of LLaMo is available at https://github.com/mlvlab/LLaMo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00871v1-abstract-full').style.display = 'none'; document.getElementById('2411.00871v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00860">arXiv:2411.00860</a> <span> [<a href="https://arxiv.org/pdf/2411.00860">pdf</a>, <a href="https://arxiv.org/format/2411.00860">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Survey of Cultural Awareness in Language Models: Text and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pawar%2C+S">Siddhesh Pawar</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junyeong Park</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jiho Jin</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+A">Arnav Arora</a>, <a href="/search/cs?searchtype=author&query=Myung%2C+J">Junho Myung</a>, <a href="/search/cs?searchtype=author&query=Yadav%2C+S">Srishti Yadav</a>, <a href="/search/cs?searchtype=author&query=Haznitrama%2C+F+G">Faiz Ghifari Haznitrama</a>, <a href="/search/cs?searchtype=author&query=Song%2C+I">Inhwa Song</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+A">Alice Oh</a>, <a href="/search/cs?searchtype=author&query=Augenstein%2C+I">Isabelle Augenstein</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00860v1-abstract-short" style="display: inline;"> Large-scale deployment of large language models (LLMs) in various applications, such as chatbots and virtual assistants, requires LLMs to be culturally sensitive to the user to ensure inclusivity. Culture has been widely studied in psychology and anthropology, and there has been a recent surge in research on making LLMs more culturally inclusive in LLMs that goes beyond multilinguality and builds… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00860v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00860v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00860v1-abstract-full" style="display: none;"> Large-scale deployment of large language models (LLMs) in various applications, such as chatbots and virtual assistants, requires LLMs to be culturally sensitive to the user to ensure inclusivity. Culture has been widely studied in psychology and anthropology, and there has been a recent surge in research on making LLMs more culturally inclusive in LLMs that goes beyond multilinguality and builds on findings from psychology and anthropology. In this paper, we survey efforts towards incorporating cultural awareness into text-based and multimodal LLMs. We start by defining cultural awareness in LLMs, taking the definitions of culture from anthropology and psychology as a point of departure. We then examine methodologies adopted for creating cross-cultural datasets, strategies for cultural inclusion in downstream tasks, and methodologies that have been used for benchmarking cultural awareness in LLMs. Further, we discuss the ethical implications of cultural alignment, the role of Human-Computer Interaction in driving cultural inclusion in LLMs, and the role of cultural alignment in driving social science research. We finally provide pointers to future research based on our findings about gaps in the literature. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00860v1-abstract-full').style.display = 'none'; document.getElementById('2411.00860v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00404">arXiv:2411.00404</a> <span> [<a href="https://arxiv.org/pdf/2411.00404">pdf</a>, <a href="https://arxiv.org/format/2411.00404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fast Adaptation with Kernel and Gradient based Meta Leaning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">JuneYoung Park</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+M">MinJae Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00404v1-abstract-short" style="display: inline;"> Model Agnostic Meta Learning or MAML has become the standard for few-shot learning as a meta-learning problem. MAML is simple and can be applied to any model, as its name suggests. However, it often suffers from instability and computational inefficiency during both training and inference times. In this paper, we propose two algorithms to improve both the inner and outer loops of MAML, then pose a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00404v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00404v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00404v1-abstract-full" style="display: none;"> Model Agnostic Meta Learning or MAML has become the standard for few-shot learning as a meta-learning problem. MAML is simple and can be applied to any model, as its name suggests. However, it often suffers from instability and computational inefficiency during both training and inference times. In this paper, we propose two algorithms to improve both the inner and outer loops of MAML, then pose an important question about what 'meta' learning truly is. Our first algorithm redefines the optimization problem in the function space to update the model using closed-form solutions instead of optimizing parameters through multiple gradient steps in the inner loop. In the outer loop, the second algorithm adjusts the learning of the meta-learner by assigning weights to the losses from each task of the inner loop. This method optimizes convergence during both the training and inference stages of MAML. In conclusion, our algorithms offer a new perspective on meta-learning and make significant discoveries in both theory and experiments. This research suggests a more efficient approach to few-shot learning and fast task adaptation compared to existing methods. Furthermore, it lays the foundation for establishing a new paradigm in meta-learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00404v1-abstract-full').style.display = 'none'; document.getElementById('2411.00404v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages(with reference), 2 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00311">arXiv:2411.00311</a> <span> [<a href="https://arxiv.org/pdf/2411.00311">pdf</a>, <a href="https://arxiv.org/format/2411.00311">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> C2A: Client-Customized Adaptation for Parameter-Efficient Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yeachan Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junho Kim</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+W">Wing-Lam Mok</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jun-Hyung Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">SangKeun Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00311v1-abstract-short" style="display: inline;"> Despite the versatility of pre-trained language models (PLMs) across domains, their large memory footprints pose significant challenges in federated learning (FL), where the training model has to be distributed between a server and clients. One potential solution to bypass such constraints might be the use of parameter-efficient fine-tuning (PEFT) in the context of FL. However, we have observed th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00311v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00311v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00311v1-abstract-full" style="display: none;"> Despite the versatility of pre-trained language models (PLMs) across domains, their large memory footprints pose significant challenges in federated learning (FL), where the training model has to be distributed between a server and clients. One potential solution to bypass such constraints might be the use of parameter-efficient fine-tuning (PEFT) in the context of FL. However, we have observed that typical PEFT tends to severely suffer from heterogeneity among clients in FL scenarios, resulting in unstable and slow convergence. In this paper, we propose Client-Customized Adaptation (C2A), a novel hypernetwork-based FL framework that generates client-specific adapters by conditioning the client information. With the effectiveness of the hypernetworks in generating customized weights through learning to adopt the different characteristics of inputs, C2A can maximize the utility of shared model parameters while minimizing the divergence caused by client heterogeneity. To verify the efficacy of C2A, we perform extensive evaluations on FL scenarios involving heterogeneity in label and language distributions. Comprehensive evaluation results clearly support the superiority of C2A in terms of both efficiency and effectiveness in FL scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00311v1-abstract-full').style.display = 'none'; document.getElementById('2411.00311v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at Findings of ACL 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23820">arXiv:2410.23820</a> <span> [<a href="https://arxiv.org/pdf/2410.23820">pdf</a>, <a href="https://arxiv.org/format/2410.23820">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Disentangled Representations: Towards Improved Latent Units via Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jun%2C+Y">Youngjun Jun</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jiwoo Park</a>, <a href="/search/cs?searchtype=author&query=Choo%2C+K">Kyobin Choo</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+T+E">Tae Eun Choi</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+S+J">Seong Jae Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23820v1-abstract-short" style="display: inline;"> Disentangled representation learning (DRL) aims to break down observed data into core intrinsic factors for a profound understanding of the data. In real-world scenarios, manually defining and labeling these factors are non-trivial, making unsupervised methods attractive. Recently, there have been limited explorations of utilizing diffusion models (DMs), which are already mainstream in generative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23820v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23820v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23820v1-abstract-full" style="display: none;"> Disentangled representation learning (DRL) aims to break down observed data into core intrinsic factors for a profound understanding of the data. In real-world scenarios, manually defining and labeling these factors are non-trivial, making unsupervised methods attractive. Recently, there have been limited explorations of utilizing diffusion models (DMs), which are already mainstream in generative modeling, for unsupervised DRL. They implement their own inductive bias to ensure that each latent unit input to the DM expresses only one distinct factor. In this context, we design Dynamic Gaussian Anchoring to enforce attribute-separated latent units for more interpretable DRL. This unconventional inductive bias explicitly delineates the decision boundaries between attributes while also promoting the independence among latent units. Additionally, we also propose Skip Dropout technique, which easily modifies the denoising U-Net to be more DRL-friendly, addressing its uncooperative nature with the disentangling feature extractor. Our methods, which carefully consider the latent unit semantics and the distinct DM structure, enhance the practicality of DM-based disentangled representations, demonstrating state-of-the-art disentanglement performance on both synthetic and real data, as well as advantages in downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23820v1-abstract-full').style.display = 'none'; document.getElementById('2410.23820v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23658">arXiv:2410.23658</a> <span> [<a href="https://arxiv.org/pdf/2410.23658">pdf</a>, <a href="https://arxiv.org/format/2410.23658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GS-Blur: A 3D Scene-Based Dataset for Realistic Image Deblurring </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dongwoo Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Joonkyu Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K+M">Kyoung Mu Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23658v1-abstract-short" style="display: inline;"> To train a deblurring network, an appropriate dataset with paired blurry and sharp images is essential. Existing datasets collect blurry images either synthetically by aggregating consecutive sharp frames or using sophisticated camera systems to capture real blur. However, these methods offer limited diversity in blur types (blur trajectories) or require extensive human effort to reconstruct large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23658v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23658v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23658v1-abstract-full" style="display: none;"> To train a deblurring network, an appropriate dataset with paired blurry and sharp images is essential. Existing datasets collect blurry images either synthetically by aggregating consecutive sharp frames or using sophisticated camera systems to capture real blur. However, these methods offer limited diversity in blur types (blur trajectories) or require extensive human effort to reconstruct large-scale datasets, failing to fully reflect real-world blur scenarios. To address this, we propose GS-Blur, a dataset of synthesized realistic blurry images created using a novel approach. To this end, we first reconstruct 3D scenes from multi-view images using 3D Gaussian Splatting (3DGS), then render blurry images by moving the camera view along the randomly generated motion trajectories. By adopting various camera trajectories in reconstructing our GS-Blur, our dataset contains realistic and diverse types of blur, offering a large-scale dataset that generalizes well to real-world blur. Using GS-Blur with various deblurring methods, we demonstrate its ability to generalize effectively compared to previous synthetic or real blur datasets, showing significant improvements in deblurring performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23658v1-abstract-full').style.display = 'none'; document.getElementById('2410.23658v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024 Datasets & Benchmarks Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22954">arXiv:2410.22954</a> <span> [<a href="https://arxiv.org/pdf/2410.22954">pdf</a>, <a href="https://arxiv.org/format/2410.22954">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Retrieval-Augmented Generation with Estimation of Source Reliability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jeongyeon Hwang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junyoung Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H">Hyejin Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sangdon Park</a>, <a href="/search/cs?searchtype=author&query=Ok%2C+J">Jungseul Ok</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22954v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) addresses key limitations of large language models (LLMs), such as hallucinations and outdated knowledge, by incorporating external databases. These databases typically consult multiple sources to encompass up-to-date and various information. However, standard RAG methods often overlook the heterogeneous source reliability in the multi-source database and retri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22954v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22954v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22954v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) addresses key limitations of large language models (LLMs), such as hallucinations and outdated knowledge, by incorporating external databases. These databases typically consult multiple sources to encompass up-to-date and various information. However, standard RAG methods often overlook the heterogeneous source reliability in the multi-source database and retrieve documents solely based on relevance, making them prone to propagating misinformation. To address this, we propose Reliability-Aware RAG (RA-RAG) which estimates the reliability of multiple sources and incorporates this information into both retrieval and aggregation processes. Specifically, it iteratively estimates source reliability and true answers for a set of queries with no labelling. Then, it selectively retrieves relevant documents from a few of reliable sources and aggregates them using weighted majority voting, where the selective retrieval ensures scalability while not compromising the performance. We also introduce a benchmark designed to reflect real-world scenarios with heterogeneous source reliability and demonstrate the effectiveness of RA-RAG compared to a set of baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22954v1-abstract-full').style.display = 'none'; document.getElementById('2410.22954v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21826">arXiv:2410.21826</a> <span> [<a href="https://arxiv.org/pdf/2410.21826">pdf</a>, <a href="https://arxiv.org/format/2410.21826">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Volumetric Conditioning Module to Control Pretrained Diffusion Models for 3D Medical Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahn%2C+S">Suhyun Ahn</a>, <a href="/search/cs?searchtype=author&query=Park%2C+W">Wonjung Park</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Jihoon Cho</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Seunghyuck Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinah Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21826v1-abstract-short" style="display: inline;"> Spatial control methods using additional modules on pretrained diffusion models have gained attention for enabling conditional generation in natural images. These methods guide the generation process with new conditions while leveraging the capabilities of large models. They could be beneficial as training strategies in the context of 3D medical imaging, where training a diffusion model from scrat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21826v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21826v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21826v1-abstract-full" style="display: none;"> Spatial control methods using additional modules on pretrained diffusion models have gained attention for enabling conditional generation in natural images. These methods guide the generation process with new conditions while leveraging the capabilities of large models. They could be beneficial as training strategies in the context of 3D medical imaging, where training a diffusion model from scratch is challenging due to high computational costs and data scarcity. However, the potential application of spatial control methods with additional modules to 3D medical images has not yet been explored. In this paper, we present a tailored spatial control method for 3D medical images with a novel lightweight module, Volumetric Conditioning Module (VCM). Our VCM employs an asymmetric U-Net architecture to effectively encode complex information from various levels of 3D conditions, providing detailed guidance in image synthesis. To examine the applicability of spatial control methods and the effectiveness of VCM for 3D medical data, we conduct experiments under single- and multimodal conditions scenarios across a wide range of dataset sizes, from extremely small datasets with 10 samples to large datasets with 500 samples. The experimental results show that the VCM is effective for conditional generation and efficient in terms of requiring less training data and computational resources. We further investigate the potential applications for our spatial control method through axial super-resolution for medical images. Our code is available at \url{https://github.com/Ahn-Ssu/VCM} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21826v1-abstract-full').style.display = 'none'; document.getElementById('2410.21826v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 18 figures, accepted @ WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21760">arXiv:2410.21760</a> <span> [<a href="https://arxiv.org/pdf/2410.21760">pdf</a>, <a href="https://arxiv.org/format/2410.21760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> A Host-SSD Collaborative Write Accelerator for LSM-Tree-Based Key-Value Stores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+K">KiHwan Kim</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H">Hyunsun Chung</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+S">Seonghoon Ahn</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junhyeok Park</a>, <a href="/search/cs?searchtype=author&query=Jamil%2C+S">Safdar Jamil</a>, <a href="/search/cs?searchtype=author&query=Byun%2C+H">Hongsu Byun</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M">Myungcheol Lee</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jinchun Choi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Youngjae Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21760v1-abstract-short" style="display: inline;"> Log-Structured Merge (LSM) tree-based Key-Value Stores (KVSs) are widely adopted for their high performance in write-intensive environments, but they often face performance degradation due to write stalls during compaction. Prior solutions, such as regulating I/O traffic or using multiple compaction threads, can cause unexpected drops in throughput or increase host CPU usage, while hardware-based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21760v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21760v1-abstract-full" style="display: none;"> Log-Structured Merge (LSM) tree-based Key-Value Stores (KVSs) are widely adopted for their high performance in write-intensive environments, but they often face performance degradation due to write stalls during compaction. Prior solutions, such as regulating I/O traffic or using multiple compaction threads, can cause unexpected drops in throughput or increase host CPU usage, while hardware-based approaches using FPGA, GPU, and DPU aimed at reducing compaction duration introduce additional hardware costs. In this study, we propose KVACCEL, a novel hardware-software co-design framework that eliminates write stalls by leveraging a dual-interface SSD. KVACCEL allocates logical NAND flash space to support both block and key-value interfaces, using the key-value interface as a temporary write buffer during write stalls. This strategy significantly reduces write stalls, optimizes resource usage, and ensures consistency between the host and device by implementing an in-device LSM-based write buffer with an iterator-based range scan mechanism. Our extensive evaluation shows that for write-intensive workloads, KVACCEL outperforms ADOC by up to 1.17x in terms of throughput and performance-to-CPU-utilization efficiency. For mixed read-write workloads, both demonstrate comparable performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21760v1-abstract-full').style.display = 'none'; document.getElementById('2410.21760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/cs?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=M%C4%85dry%2C+A">Aleksander M膮dry</a>, <a href="/search/cs?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al. (395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20774">arXiv:2410.20774</a> <span> [<a href="https://arxiv.org/pdf/2410.20774">pdf</a>, <a href="https://arxiv.org/format/2410.20774">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Are LLM-Judges Robust to Expressions of Uncertainty? Investigating the effect of Epistemic Markers on LLM-based Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dongryeol Lee</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+Y">Yerin Hwang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yongil Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Joonsuk Park</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+K">Kyomin Jung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20774v1-abstract-short" style="display: inline;"> In line with the principle of honesty, there has been a growing effort to train large language models (LLMs) to generate outputs containing epistemic markers. However, evaluation in the presence of epistemic markers has been largely overlooked, raising a critical question: Could the use of epistemic markers in LLM-generated outputs lead to unintended negative consequences? To address this, we pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20774v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20774v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20774v1-abstract-full" style="display: none;"> In line with the principle of honesty, there has been a growing effort to train large language models (LLMs) to generate outputs containing epistemic markers. However, evaluation in the presence of epistemic markers has been largely overlooked, raising a critical question: Could the use of epistemic markers in LLM-generated outputs lead to unintended negative consequences? To address this, we present EMBER, a benchmark designed to assess the robustness of LLM-judges to epistemic markers in both single and pairwise evaluation settings. Our findings, based on evaluations using EMBER, reveal that all tested LLM-judges, including GPT-4o, show a notable lack of robustness in the presence of epistemic markers. Specifically, we observe a negative bias toward epistemic markers, with a stronger bias against markers expressing uncertainty. This suggests that LLM-judges are influenced by the presence of these markers and do not focus solely on the correctness of the content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20774v1-abstract-full').style.display = 'none'; document.getElementById('2410.20774v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 6 figures, 15 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20255">arXiv:2410.20255</a> <span> [<a href="https://arxiv.org/pdf/2410.20255">pdf</a>, <a href="https://arxiv.org/format/2410.20255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Equivariant Blurring Diffusion for Hierarchical Molecular Conformer Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jiwoong Park</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yang Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20255v1-abstract-short" style="display: inline;"> How can diffusion models process 3D geometries in a coarse-to-fine manner, akin to our multiscale view of the world? In this paper, we address the question by focusing on a fundamental biochemical problem of generating 3D molecular conformers conditioned on molecular graphs in a multiscale manner. Our approach consists of two hierarchical stages: i) generation of coarse-grained fragment-level 3D s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20255v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20255v1-abstract-full" style="display: none;"> How can diffusion models process 3D geometries in a coarse-to-fine manner, akin to our multiscale view of the world? In this paper, we address the question by focusing on a fundamental biochemical problem of generating 3D molecular conformers conditioned on molecular graphs in a multiscale manner. Our approach consists of two hierarchical stages: i) generation of coarse-grained fragment-level 3D structure from the molecular graph, and ii) generation of fine atomic details from the coarse-grained approximated structure while allowing the latter to be adjusted simultaneously. For the challenging second stage, which demands preserving coarse-grained information while ensuring SE(3) equivariance, we introduce a novel generative model termed Equivariant Blurring Diffusion (EBD), which defines a forward process that moves towards the fragment-level coarse-grained structure by blurring the fine atomic details of conformers, and a reverse process that performs the opposite operation using equivariant networks. We demonstrate the effectiveness of EBD by geometric and chemical comparison to state-of-the-art denoising diffusion models on a benchmark of drug-like molecules. Ablation studies draw insights on the design of EBD by thoroughly analyzing its architecture, which includes the design of the loss function and the data corruption process. Codes are released at https://github.com/Shen-Lab/EBD . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20255v1-abstract-full').style.display = 'none'; document.getElementById('2410.20255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18352">arXiv:2410.18352</a> <span> [<a href="https://arxiv.org/pdf/2410.18352">pdf</a>, <a href="https://arxiv.org/format/2410.18352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedBaF: Federated Learning Aggregation Biased by a Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jong-Ik Park</a>, <a href="/search/cs?searchtype=author&query=Pranav%2C+S">Srinivasa Pranav</a>, <a href="/search/cs?searchtype=author&query=Moura%2C+J+M+F">Jos茅 M. F. Moura</a>, <a href="/search/cs?searchtype=author&query=Joe-Wong%2C+C">Carlee Joe-Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18352v1-abstract-short" style="display: inline;"> Foundation models are now a major focus of leading technology organizations due to their ability to generalize across diverse tasks. Existing approaches for adapting foundation models to new applications often rely on Federated Learning (FL) and disclose the foundation model weights to clients when using it to initialize the global model. While these methods ensure client data privacy, they compro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18352v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18352v1-abstract-full" style="display: none;"> Foundation models are now a major focus of leading technology organizations due to their ability to generalize across diverse tasks. Existing approaches for adapting foundation models to new applications often rely on Federated Learning (FL) and disclose the foundation model weights to clients when using it to initialize the global model. While these methods ensure client data privacy, they compromise model and information security. In this paper, we introduce Federated Learning Aggregation Biased by a Foundation Model (FedBaF), a novel method for dynamically integrating pre-trained foundation model weights during the FL aggregation phase. Unlike conventional methods, FedBaF preserves the confidentiality of the foundation model while still leveraging its power to train more accurate models, especially in non-IID and adversarial scenarios. Our comprehensive experiments use Pre-ResNet and foundation models like Vision Transformer to demonstrate that FedBaF not only matches, but often surpasses the test accuracy of traditional weight initialization methods by up to 11.4\% in IID and up to 15.8\% in non-IID settings. Additionally, FedBaF applied to a Transformer-based language model significantly reduced perplexity by up to 39.2\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18352v1-abstract-full').style.display = 'none'; document.getElementById('2410.18352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18001">arXiv:2410.18001</a> <span> [<a href="https://arxiv.org/pdf/2410.18001">pdf</a>, <a href="https://arxiv.org/format/2410.18001">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Foundation Models on Exceptional Cases: Dataset Creation and Validation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+S">Suho Kang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jungyang Park</a>, <a href="/search/cs?searchtype=author&query=Ha%2C+J">Joonseo Ha</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">SoMin Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">JinHyeong Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Subeen Park</a>, <a href="/search/cs?searchtype=author&query=Song%2C+K">Kyungwoo Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18001v1-abstract-short" style="display: inline;"> Foundation models (FMs) have achieved significant success across various tasks, leading to research on benchmarks for reasoning abilities. However, there is a lack of studies on FMs performance in exceptional scenarios, which we define as out-of-distribution (OOD) reasoning tasks. This paper is the first to address these cases, developing a novel dataset for evaluation of FMs across multiple modal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18001v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18001v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18001v1-abstract-full" style="display: none;"> Foundation models (FMs) have achieved significant success across various tasks, leading to research on benchmarks for reasoning abilities. However, there is a lack of studies on FMs performance in exceptional scenarios, which we define as out-of-distribution (OOD) reasoning tasks. This paper is the first to address these cases, developing a novel dataset for evaluation of FMs across multiple modalities, including graphic novels, calligraphy, news articles, and lyrics. It includes tasks for instance classification, character recognition, token prediction, and text generation. The paper also proposes prompt engineering techniques like Chain-of-Thought (CoT) and CoT+Few-Shot to enhance performance. Validation of FMs using various methods revealed improvements. The code repository is accessible at: https://github.com/MLAI-Yonsei/ExceptionalBenchmark <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18001v1-abstract-full').style.display = 'none'; document.getElementById('2410.18001v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Workshop Genbench(https://genbench.org/workshop_programme/)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17421">arXiv:2410.17421</a> <span> [<a href="https://arxiv.org/pdf/2410.17421">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> From an attention economy to an ecology of attending. A manifesto </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bombaerts%2C+G">Gunter Bombaerts</a>, <a href="/search/cs?searchtype=author&query=Hannes%2C+T">Tom Hannes</a>, <a href="/search/cs?searchtype=author&query=Adam%2C+M">Martin Adam</a>, <a href="/search/cs?searchtype=author&query=Aloisi%2C+A">Alessandra Aloisi</a>, <a href="/search/cs?searchtype=author&query=Anderson%2C+J">Joel Anderson</a>, <a href="/search/cs?searchtype=author&query=Berger%2C+L">Lawrence Berger</a>, <a href="/search/cs?searchtype=author&query=Bettera%2C+S+D">Stefano Davide Bettera</a>, <a href="/search/cs?searchtype=author&query=Campo%2C+E">Enrico Campo</a>, <a href="/search/cs?searchtype=author&query=Candiotto%2C+L">Laura Candiotto</a>, <a href="/search/cs?searchtype=author&query=Panizza%2C+S+C">Silvia Caprioglio Panizza</a>, <a href="/search/cs?searchtype=author&query=Citton%2C+Y">Yves Citton</a>, <a href="/search/cs?searchtype=author&query=D%C3%A2%C2%80%C2%99Angelo%2C+D">Diego D芒聙聶Angelo</a>, <a href="/search/cs?searchtype=author&query=Dennis%2C+M">Matthew Dennis</a>, <a href="/search/cs?searchtype=author&query=Depraz%2C+N">Nathalie Depraz</a>, <a href="/search/cs?searchtype=author&query=Doran%2C+P">Peter Doran</a>, <a href="/search/cs?searchtype=author&query=Drechsler%2C+W">Wolfgang Drechsler</a>, <a href="/search/cs?searchtype=author&query=Duane%2C+B">Bill Duane</a>, <a href="/search/cs?searchtype=author&query=Edelglass%2C+W">William Edelglass</a>, <a href="/search/cs?searchtype=author&query=Eisenberger%2C+I">Iris Eisenberger</a>, <a href="/search/cs?searchtype=author&query=McGuire%2C+B+F">Beverley Foulks McGuire</a>, <a href="/search/cs?searchtype=author&query=Fredriksson%2C+A">Antony Fredriksson</a>, <a href="/search/cs?searchtype=author&query=Gill%2C+K+S">Karamjit S. Gill</a>, <a href="/search/cs?searchtype=author&query=Hershock%2C+P+D">Peter D. Hershock</a>, <a href="/search/cs?searchtype=author&query=Hongladarom%2C+S">Soraj Hongladarom</a>, <a href="/search/cs?searchtype=author&query=Jacobs%2C+B">Beth Jacobs</a> , et al. (30 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17421v1-abstract-short" style="display: inline;"> As the signatories of this manifesto, we denounce the attention economy as inhumane and a threat to our sociopolitical and ecological well-being. We endorse policymakers' efforts to address the negative consequences of the attention economy's technology, but add that these approaches are often limited in their criticism of the systemic context of human attention. Starting from Buddhist philosophy,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17421v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17421v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17421v1-abstract-full" style="display: none;"> As the signatories of this manifesto, we denounce the attention economy as inhumane and a threat to our sociopolitical and ecological well-being. We endorse policymakers' efforts to address the negative consequences of the attention economy's technology, but add that these approaches are often limited in their criticism of the systemic context of human attention. Starting from Buddhist philosophy, we advocate a broader approach: an ecology of attending, that centers on conceptualizing, designing, and using attention (1) in an embedded way and (2) focused on the alleviating of suffering. With 'embedded' we mean that attention is not a neutral, isolated mechanism but a meaning-engendering part of an 'ecology' of bodily, sociotechnical and moral frameworks. With 'focused on the alleviation of suffering' we explicitly move away from the (often implicit) conception of attention as a tool for gratifying desires. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17421v1-abstract-full').style.display = 'none'; document.getElementById('2410.17421v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17270">arXiv:2410.17270</a> <span> [<a href="https://arxiv.org/pdf/2410.17270">pdf</a>, <a href="https://arxiv.org/format/2410.17270">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MOFFlow: Flow Matching for Structure Prediction of Metal-Organic Frameworks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+N">Nayoung Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seongsu Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minsu Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinkyoo Park</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+S">Sungsoo Ahn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17270v1-abstract-short" style="display: inline;"> Metal-organic frameworks (MOFs) are a class of crystalline materials with promising applications in many areas such as carbon capture and drug delivery. In this work, we introduce MOFFlow, the first deep generative model tailored for MOF structure prediction. Existing approaches, including ab initio calculations and even deep generative models, struggle with the complexity of MOF structures due to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17270v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17270v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17270v1-abstract-full" style="display: none;"> Metal-organic frameworks (MOFs) are a class of crystalline materials with promising applications in many areas such as carbon capture and drug delivery. In this work, we introduce MOFFlow, the first deep generative model tailored for MOF structure prediction. Existing approaches, including ab initio calculations and even deep generative models, struggle with the complexity of MOF structures due to the large number of atoms in the unit cells. To address this limitation, we propose a novel Riemannian flow matching framework that reduces the dimensionality of the problem by treating the metal nodes and organic linkers as rigid bodies, capitalizing on the inherent modularity of MOFs. By operating in the $SE(3)$ space, MOFFlow effectively captures the roto-translational dynamics of these rigid components in a scalable way. Our experiment demonstrates that MOFFlow accurately predicts MOF structures containing several hundred atoms, significantly outperforming conventional methods and state-of-the-art machine learning baselines while being much faster. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17270v1-abstract-full').style.display = 'none'; document.getElementById('2410.17270v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16647">arXiv:2410.16647</a> <span> [<a href="https://arxiv.org/pdf/2410.16647">pdf</a>, <a href="https://arxiv.org/format/2410.16647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GE2E-KWS: Generalized End-to-End Training and Evaluation for Zero-shot Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+P">Pai Zhu</a>, <a href="/search/cs?searchtype=author&query=Bartel%2C+J+W">Jacob W. Bartel</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+D">Dhruuv Agarwal</a>, <a href="/search/cs?searchtype=author&query=Partridge%2C+K">Kurt Partridge</a>, <a href="/search/cs?searchtype=author&query=Park%2C+H+J">Hyun Jin Park</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Quan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16647v1-abstract-short" style="display: inline;"> We propose GE2E-KWS -- a generalized end-to-end training and evaluation framework for customized keyword spotting. Specifically, enrollment utterances are separated and grouped by keywords from the training batch and their embedding centroids are compared to all other test utterance embeddings to compute the loss. This simulates runtime enrollment and verification stages, and improves convergence… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16647v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16647v1-abstract-full" style="display: none;"> We propose GE2E-KWS -- a generalized end-to-end training and evaluation framework for customized keyword spotting. Specifically, enrollment utterances are separated and grouped by keywords from the training batch and their embedding centroids are compared to all other test utterance embeddings to compute the loss. This simulates runtime enrollment and verification stages, and improves convergence stability and training speed by optimizing matrix operations compared to SOTA triplet loss approaches. To benchmark different models reliably, we propose an evaluation process that mimics the production environment and compute metrics that directly measure keyword matching accuracy. Trained with GE2E loss, our 419KB quantized conformer model beats a 7.5GB ASR encoder by 23.6% relative AUC, and beats a same size triplet loss model by 60.7% AUC. Our KWS models are natively streamable with low memory footprints, and designed to continuously run on-device with no retraining needed for new keywords (zero-shot). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16647v1-abstract-full').style.display = 'none'; document.getElementById('2410.16647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures, 2 tables The paper is accepted in IEEE Spoken Language Technology (SLT) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16507">arXiv:2410.16507</a> <span> [<a href="https://arxiv.org/pdf/2410.16507">pdf</a>, <a href="https://arxiv.org/format/2410.16507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> How the Internet Facilitates Adverse Childhood Experiences for Youth Who Self-Identify as in Need of Services </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oguine%2C+O+C">Ozioma C. Oguine</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J+K">Jinkyung Katie Park</a>, <a href="/search/cs?searchtype=author&query=Akter%2C+M">Mamtaj Akter</a>, <a href="/search/cs?searchtype=author&query=Olesk%2C+J">Johanna Olesk</a>, <a href="/search/cs?searchtype=author&query=Alluhidan%2C+A">Abdulmalik Alluhidan</a>, <a href="/search/cs?searchtype=author&query=Wisniewski%2C+P">Pamela Wisniewski</a>, <a href="/search/cs?searchtype=author&query=Badillo-Urquiola%2C+K">Karla Badillo-Urquiola</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16507v1-abstract-short" style="display: inline;"> Youth implicated in the child welfare and juvenile justice systems, as well as those with an incarcerated parent, are considered the most vulnerable Children in Need of Services (CHINS). We identified 1,160 of these at-risk youth (ages 13-17) who sought support via an online peer support platform to understand their adverse childhood experiences and explore how the internet played a role in provid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16507v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16507v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16507v1-abstract-full" style="display: none;"> Youth implicated in the child welfare and juvenile justice systems, as well as those with an incarcerated parent, are considered the most vulnerable Children in Need of Services (CHINS). We identified 1,160 of these at-risk youth (ages 13-17) who sought support via an online peer support platform to understand their adverse childhood experiences and explore how the internet played a role in providing an outlet for support, as well as potentially facilitating risks. We first analyzed posts from 1,160 youth who self-identified as CHINS while sharing about their adverse experiences. Then, we retrieved all 239,929 posts by these users to identify salient topics within their support-seeking posts: 1) Urges to self-harm due to social drama, 2) desire for social connection, 3) struggles with family, and 4) substance use and sexual risks. We found that the internet often helped facilitate these problems; for example, the desperation for social connection often led to meeting unsafe people online, causing additional trauma. Family members and other unsafe people used the internet to perpetrate cyberabuse, while CHINS themselves leveraged online channels to engage in illegal and risky behavior. Our study calls for tailored support systems that address the unique needs of CHINS to promote safe online spaces and foster resilience to break the cycle of adversity. Empowering CHINS requires amplifying their voices and acknowledging the challenges they face as a result of their adverse childhood experiences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16507v1-abstract-full').style.display = 'none'; document.getElementById('2410.16507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15126">arXiv:2410.15126</a> <span> [<a href="https://arxiv.org/pdf/2410.15126">pdf</a>, <a href="https://arxiv.org/format/2410.15126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MELT: Materials-aware Continued Pre-training for Language Model Adaptation to Materials Science </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+J">Junho Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yeachan Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jun-Hyung Park</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+Y">Yerim Oh</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Suho Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">SangKeun Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15126v1-abstract-short" style="display: inline;"> We introduce a novel continued pre-training method, MELT (MatEriaLs-aware continued pre-Training), specifically designed to efficiently adapt the pre-trained language models (PLMs) for materials science. Unlike previous adaptation strategies that solely focus on constructing domain-specific corpus, MELT comprehensively considers both the corpus and the training strategy, given that materials scien… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15126v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15126v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15126v1-abstract-full" style="display: none;"> We introduce a novel continued pre-training method, MELT (MatEriaLs-aware continued pre-Training), specifically designed to efficiently adapt the pre-trained language models (PLMs) for materials science. Unlike previous adaptation strategies that solely focus on constructing domain-specific corpus, MELT comprehensively considers both the corpus and the training strategy, given that materials science corpus has distinct characteristics from other domains. To this end, we first construct a comprehensive materials knowledge base from the scientific corpus by building semantic graphs. Leveraging this extracted knowledge, we integrate a curriculum into the adaptation process that begins with familiar and generalized concepts and progressively moves toward more specialized terms. We conduct extensive experiments across diverse benchmarks to verify the effectiveness and generality of MELT. A comprehensive evaluation convincingly supports the strength of MELT, demonstrating superior performance compared to existing continued pre-training methods. The in-depth analysis also shows that MELT enables PLMs to effectively represent materials entities compared to the existing adaptation methods, thereby highlighting its broad applicability across a wide spectrum of materials science. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15126v1-abstract-full').style.display = 'none'; document.getElementById('2410.15126v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EMNLP 2024 (Findings)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15008">arXiv:2410.15008</a> <span> [<a href="https://arxiv.org/pdf/2410.15008">pdf</a>, <a href="https://arxiv.org/format/2410.15008">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3620666.3651324">10.1145/3620666.3651324 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> IANUS: Integrated Accelerator based on NPU-PIM Unified Memory System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seo%2C+M">Minseok Seo</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+X+T">Xuan Truong Nguyen</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+S+J">Seok Joong Hwang</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+Y">Yongkee Kwon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+G">Guhyun Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+C">Chanwook Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+I">Ilkon Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jaehan Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jeongbin Kim</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+W">Woojae Shin</a>, <a href="/search/cs?searchtype=author&query=Won%2C+J">Jongsoon Won</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+H">Haerang Choi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyuyoung Kim</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+D">Daehan Kwon</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+C">Chunseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sangheon Lee</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yongseok Choi</a>, <a href="/search/cs?searchtype=author&query=Byun%2C+W">Wooseok Byun</a>, <a href="/search/cs?searchtype=author&query=Baek%2C+S">Seungcheol Baek</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hyuk-Jae Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">John Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15008v1-abstract-short" style="display: inline;"> Accelerating end-to-end inference of transformer-based large language models (LLMs) is a critical component of AI services in datacenters. However, diverse compute characteristics of end-to-end LLM inference present challenges as previously proposed accelerators only address certain operations or stages (e.g., self-attention, generation stage, etc.). To address the unique challenges of acceleratin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15008v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15008v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15008v1-abstract-full" style="display: none;"> Accelerating end-to-end inference of transformer-based large language models (LLMs) is a critical component of AI services in datacenters. However, diverse compute characteristics of end-to-end LLM inference present challenges as previously proposed accelerators only address certain operations or stages (e.g., self-attention, generation stage, etc.). To address the unique challenges of accelerating end-to-end inference, we propose IANUS -- Integrated Accelerator based on NPU-PIM Unified Memory System. IANUS is a domain-specific system architecture that combines a Neural Processing Unit (NPU) with a Processing-in-Memory (PIM) to leverage both the NPU's high computation throughput and the PIM's high effective memory bandwidth. In particular, IANUS employs a unified main memory system where the PIM memory is used both for PIM operations and for NPU's main memory. The unified main memory system ensures that memory capacity is efficiently utilized and the movement of shared data between NPU and PIM is minimized. However, it introduces new challenges since normal memory accesses and PIM computations cannot be performed simultaneously. Thus, we propose novel PIM Access Scheduling that manages normal memory accesses and PIM computations through workload mapping and scheduling across the PIM and the NPU. Our detailed simulation evaluations show that IANUS improves the performance of GPT-2 by 6.2$\times$ and 3.2$\times$, on average, compared to the NVIDIA A100 GPU and the state-of-the-art accelerator. As a proof-of-concept, we develop a prototype of IANUS with a commercial PIM, NPU, and an FPGA-based PIM controller to demonstrate the feasibility of IANUS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15008v1-abstract-full').style.display = 'none'; document.getElementById('2410.15008v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated version of the paper accepted to ASPLOS 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ASPLOS 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14017">arXiv:2410.14017</a> <span> [<a href="https://arxiv.org/pdf/2410.14017">pdf</a>, <a href="https://arxiv.org/format/2410.14017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Probabilistic U-Net with Kendall Shape Spaces for Geometry-Aware Segmentations of Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Jiyoung Park</a>, <a href="/search/cs?searchtype=author&query=Do%C4%9Fan%2C+G">G眉nay Do臒an</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14017v1-abstract-short" style="display: inline;"> One of the fundamental problems in computer vision is image segmentation, the task of detecting distinct regions or objects in given images. Deep Neural Networks (DNN) have been shown to be very effective in segmenting challenging images, producing convincing segmentations. There is further need for probabilistic DNNs that can reflect the uncertainties from the input images and the models into the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14017v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14017v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14017v1-abstract-full" style="display: none;"> One of the fundamental problems in computer vision is image segmentation, the task of detecting distinct regions or objects in given images. Deep Neural Networks (DNN) have been shown to be very effective in segmenting challenging images, producing convincing segmentations. There is further need for probabilistic DNNs that can reflect the uncertainties from the input images and the models into the computed segmentations, in other words, new DNNs that can generate multiple plausible segmentations and their distributions depending on the input or the model uncertainties. While there are existing probabilistic segmentation models, many of them do not take into account the geometry or shape underlying the segmented regions. In this paper, we propose a probabilistic image segmentation model that can incorporate the geometry of a segmentation. Our proposed model builds on the Probabilistic U-Net of \cite{kohl2018probabilistic} to generate probabilistic segmentations, i.e.\! multiple likely segmentations for an input image. Our model also adopts the Kendall Shape Variational Auto-Encoder of \cite{vadgama2023kendall} to encode a Kendall shape space in the latent variable layers of the prior and posterior networks of the Probabilistic U-Net. Incorporating the shape space in this manner leads to a more robust segmentation with spatially coherent regions, respecting the underlying geometry in the input images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14017v1-abstract-full').style.display = 'none'; document.getElementById('2410.14017v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13839">arXiv:2410.13839</a> <span> [<a href="https://arxiv.org/pdf/2410.13839">pdf</a>, <a href="https://arxiv.org/format/2410.13839">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Codec-based Speech Synthesis with Multi-Token Prediction and Speculative Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+T+D">Tan Dat Nguyen</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Ji-Hoon Kim</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jeongsoo Choi</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+S">Shukjae Choi</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jinseok Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y">Younglo Lee</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+S">Joon Son Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13839v1-abstract-short" style="display: inline;"> The goal of this paper is to accelerate codec-based speech synthesis systems with minimum sacrifice to speech quality. We propose an enhanced inference method that allows for flexible trade-offs between speed and quality during inference without requiring additional training. Our core idea is to predict multiple tokens per inference step of the AR module using multiple prediction heads, resulting… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13839v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13839v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13839v1-abstract-full" style="display: none;"> The goal of this paper is to accelerate codec-based speech synthesis systems with minimum sacrifice to speech quality. We propose an enhanced inference method that allows for flexible trade-offs between speed and quality during inference without requiring additional training. Our core idea is to predict multiple tokens per inference step of the AR module using multiple prediction heads, resulting in a linear reduction in synthesis time as the number of heads increases. Furthermore, we introduce a novel speculative decoding technique that utilises a Viterbi-based algorithm to select the optimal sequence of generated tokens at each decoding step. In our experiments, we demonstrate that the time required to predict each token is reduced by a factor of 4 to 5 compared to baseline models, with minimal quality trade-off or even improvement in terms of speech intelligibility. Audio samples are available at: multpletokensprediction.github.io/multipletokensprediction.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13839v1-abstract-full').style.display = 'none'; document.getElementById('2410.13839v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12750">arXiv:2410.12750</a> <span> [<a href="https://arxiv.org/pdf/2410.12750">pdf</a>, <a href="https://arxiv.org/ps/2410.12750">ps</a>, <a href="https://arxiv.org/format/2410.12750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Comparative Analysis of Extrinsic Factors for NER in French </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+G">Grace Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiyi Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yadong Liu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Jungyeul Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12750v2-abstract-short" style="display: inline;"> Named entity recognition (NER) is a crucial task that aims to identify structured information, which is often replete with complex, technical terms and a high degree of variability. Accurate and reliable NER can facilitate the extraction and analysis of important information. However, NER for other than English is challenging due to limited data availability, as the high expertise, time, and expen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12750v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12750v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12750v2-abstract-full" style="display: none;"> Named entity recognition (NER) is a crucial task that aims to identify structured information, which is often replete with complex, technical terms and a high degree of variability. Accurate and reliable NER can facilitate the extraction and analysis of important information. However, NER for other than English is challenging due to limited data availability, as the high expertise, time, and expenses are required to annotate its data. In this paper, by using the limited data, we explore various factors including model structure, corpus annotation scheme and data augmentation techniques to improve the performance of a NER model for French. Our experiments demonstrate that these approaches can significantly improve the model's F1 score from original CRF score of 62.41 to 79.39. Our findings suggest that considering different extrinsic factors and combining these techniques is a promising approach for improving NER performance where the size of data is limited. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12750v2-abstract-full').style.display = 'none'; document.getElementById('2410.12750v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12692">arXiv:2410.12692</a> <span> [<a href="https://arxiv.org/pdf/2410.12692">pdf</a>, <a href="https://arxiv.org/format/2410.12692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Machine learning approach to brain tumor detection and classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oh%2C+A">Alice Oh</a>, <a href="/search/cs?searchtype=author&query=Noh%2C+I">Inyoung Noh</a>, <a href="/search/cs?searchtype=author&query=Choo%2C+J">Jian Choo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jihoo Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Justin Park</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+K">Kate Hwang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sanghyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+S+M">Soo Min Oh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12692v2-abstract-short" style="display: inline;"> Brain tumor detection and classification are critical tasks in medical image analysis, particularly in early-stage diagnosis, where accurate and timely detection can significantly improve treatment outcomes. In this study, we apply various statistical and machine learning models to detect and classify brain tumors using brain MRI images. We explore a variety of statistical models including linear,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12692v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12692v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12692v2-abstract-full" style="display: none;"> Brain tumor detection and classification are critical tasks in medical image analysis, particularly in early-stage diagnosis, where accurate and timely detection can significantly improve treatment outcomes. In this study, we apply various statistical and machine learning models to detect and classify brain tumors using brain MRI images. We explore a variety of statistical models including linear, logistic, and Bayesian regressions, and the machine learning models including decision tree, random forest, single-layer perceptron, multi-layer perceptron, convolutional neural network (CNN), recurrent neural network, and long short-term memory. Our findings show that CNN outperforms other models, achieving the best performance. Additionally, we confirm that the CNN model can also work for multi-class classification, distinguishing between four categories of brain MRI images such as normal, glioma, meningioma, and pituitary tumor images. This study demonstrates that machine learning approaches are suitable for brain tumor detection and classification, facilitating real-world medical applications in assisting radiologists with early and accurate diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12692v2-abstract-full').style.display = 'none'; document.getElementById('2410.12692v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 2 figures, 2 tables</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Park%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Park%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository