Search | arXiv e-print repository

Showing 1–50 of 222 results for author: Park, D

Searching in archive cs; the same query can be run against all archives at https://arxiv.org/search/?searchtype=author&query=Park%2C+D. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2502.10995 [pdf, other] cs.CL (Computation and Language)

   Evaluating Large language models on Understanding Korean indirect Speech acts

   Authors: Youngeun Koo, Jiwoo Lee, Dojun Park, Seohyun Park, Sungeun Lee

   Abstract: Accurately understanding the intention of an utterance is crucial in conversational communication. As conversational artificial intelligence models are rapidly being developed and applied in various fields, it is important to evaluate LLMs' capability to understand the intentions behind users' utterances. This study evaluates whether current LLMs can understand the intention of an utterance by considering the given conversational context, particularly in cases where the actual intention differs from the surface-level, literal intention of the sentence, i.e. indirect speech acts. Our findings reveal that Claude3-Opus outperformed the other competing models, with 71.94% in MCQ and 65% in OEQ, showing a clear advantage. In general, proprietary models exhibited relatively higher performance than open-source models. Nevertheless, no LLM reached the level of human performance. Most LLMs, except for Claude3-Opus, demonstrated significantly lower performance in understanding indirect speech acts compared to direct speech acts, where the intention is explicitly revealed through the utterance. This study not only performs an overall pragmatic evaluation of each LLM's language use through the analysis of OEQ response patterns, but also emphasizes the necessity for further research to improve LLMs' understanding of indirect speech acts for more natural communication with humans.

   Submitted 15 February, 2025; originally announced February 2025.
   Comments: under review (15 pages)
2. arXiv:2502.06542 [pdf, other] quant-ph, cs.ET (Quantum Physics; Emerging Technologies)

   Hamiltonian formulations of centroid-based clustering

   Authors: Myeonghwan Seong, Daniel K. Park

   Abstract: Clustering is a fundamental task in data science that aims to group data based on their similarities. However, defining similarity is often ambiguous, making it challenging to determine the most appropriate objective function for a given dataset. Traditional clustering methods, such as the $k$-means algorithm and weighted maximum $k$-cut, focus on specific objectives -- typically relying on average or pairwise characteristics of the data -- leading to performance that is highly data-dependent. Moreover, incorporating practical constraints into clustering objectives is not straightforward, and these problems are known to be NP-hard. In this study, we formulate the clustering problem as a search for the ground state of a Hamiltonian, providing greater flexibility in defining clustering objectives and incorporating constraints. This approach enables the application of various quantum simulation techniques, including both circuit-based quantum computation and quantum annealing, thereby opening a path toward quantum advantage in solving clustering problems. We propose various Hamiltonians to accommodate different clustering objectives, including the ability to combine multiple objectives and incorporate constraints. We evaluate the clustering performance through numerical simulations and implementations on the D-Wave quantum annealer. The results demonstrate the broad applicability of our approach to a variety of clustering problems on current quantum devices. Furthermore, we find that Hamiltonians designed for specific clustering objectives and constraints impose different requirements for qubit connectivity, indicating that certain clustering tasks are better suited to specific quantum hardware. Our experimental results highlight this by identifying the Hamiltonian that optimally utilizes the physical qubits available in the D-Wave System.

   Submitted 10 February, 2025; originally announced February 2025.
   Comments: 17 pages, 8 figures
3. arXiv:2502.05295 [pdf, other] cs.LG, stat.ME (Machine Learning; Methodology)

   GST-UNet: Spatiotemporal Causal Inference with Time-Varying Confounders

   Authors: Miruna Oprescu, David K. Park, Xihaier Luo, Shinjae Yoo, Nathan Kallus

   Abstract: Estimating causal effects from spatiotemporal data is a key challenge in fields such as public health, social policy, and environmental science, where controlled experiments are often infeasible. However, existing causal inference methods relying on observational data face significant limitations: they depend on strong structural assumptions to address spatiotemporal challenges (such as interference, spatial confounding, and temporal carryover effects) or fail to account for time-varying confounders. These confounders, influenced by past treatments and outcomes, can themselves shape future treatments and outcomes, creating feedback loops that complicate traditional adjustment strategies. To address these challenges, we introduce the GST-UNet (G-computation Spatio-Temporal UNet), a novel end-to-end neural network framework designed to estimate treatment effects in complex spatial and temporal settings. The GST-UNet leverages regression-based iterative G-computation to explicitly adjust for time-varying confounders, providing valid estimates of potential outcomes and treatment effects. To the best of our knowledge, the GST-UNet is the first neural model to account for complex, non-linear dynamics and time-varying confounders in spatiotemporal interventions. We demonstrate the effectiveness of the GST-UNet through extensive simulation studies and showcase its practical utility with a real-world analysis of the impact of wildfire smoke on respiratory hospitalizations during the 2018 California Camp Fire. Our results highlight the potential of GST-UNet to advance spatiotemporal causal inference across a wide range of policy-driven and scientific applications.

   Submitted 7 February, 2025; originally announced February 2025.
   Comments: 17 pages, 6 figures, 2 tables
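For readers unfamiliar with regression-based iterative G-computation, its backbone is a backward recursion of fitted regressions. Below is a toy two-step tabular sketch under sequential ignorability; the function name is illustrative and plain linear models stand in for the GST-UNet network (this is a caricature of the idea, not the authors' code):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

def g_comp_two_steps(X1, A1, X2, A2, Y, a1_star, a2_star):
    """Estimate E[Y(a1*, a2*)] by regression-based iterative
    G-computation for two time steps, assuming sequential
    ignorability. Linear models are illustrative stand-ins.
    """
    # Step 2: regress the outcome on the full observed history.
    h2 = np.column_stack([X1, A1, X2, A2])
    q2 = LinearRegression().fit(h2, Y)

    # Evaluate with the last treatment set to the intervention a2*.
    h2_int = np.column_stack([X1, A1, X2, np.full(A2.shape, a2_star)])
    pseudo_y = q2.predict(h2_int)

    # Step 1: regress the pseudo-outcome on the earlier history.
    q1 = LinearRegression().fit(np.column_stack([X1, A1]), pseudo_y)

    # Evaluate with the first treatment set to a1*, then average.
    h1_int = np.column_stack([X1, np.full(A1.shape, a1_star)])
    return q1.predict(h1_int).mean()
```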
4. arXiv:2502.00261 [pdf, other] cs.DC (Distributed, Parallel, and Cluster Computing)

   Alternative Mixed Integer Linear Programming Optimization for Joint Job Scheduling and Data Allocation in Grid Computing

   Authors: Shengyu Feng, Jaehyung Kim, Yiming Yang, Joseph Boudreau, Tasnuva Chowdhury, Adolfy Hoisie, Raees Khan, Ozgur O. Kilic, Scott Klasky, Tatiana Korchuganova, Paul Nilsson, Verena Ingrid Martinez Outschoorn, David K. Park, Norbert Podhorszki, Yihui Ren, Frederic Suter, Sairam Sri Vatsavai, Wei Yang, Shinjae Yoo, Tadashi Maeno, Alexei Klimentov

   Abstract: This paper presents a novel approach to the joint optimization of job scheduling and data allocation in grid computing environments. We formulate this joint optimization problem as a mixed integer quadratically constrained program. To tackle the nonlinearity in the constraint, we alternately fix a subset of decision variables and optimize the remaining ones via Mixed Integer Linear Programming (MILP). We solve the MILP problem at each iteration via an off-the-shelf MILP solver. Our experimental results show that our method significantly outperforms existing heuristic methods employing either independent or joint optimization strategies. We have also verified the generalization ability of our method across grid environments of various sizes and its high robustness to the algorithm hyper-parameters.

   Submitted 31 January, 2025; originally announced February 2025.
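The alternating scheme the abstract describes can be sketched as a fix-and-optimize loop: freeze one block of variables so the quadratic constraint becomes linear, solve the resulting MILP, then swap blocks. The two `solve_*` helpers below are hypothetical stand-ins for the paper's scheduling and allocation MILP models, so this is a structural sketch rather than end-to-end runnable code:

```python
def alternating_milp(init_allocation, max_iters=20, tol=1e-6):
    """Alternating fix-and-optimize for a mixed integer quadratically
    constrained program: with one variable block frozen, the problem
    becomes a MILP that an off-the-shelf solver can handle. The
    solve_* helpers are hypothetical stand-ins for the actual models.
    """
    allocation, best_cost = init_allocation, float("inf")
    schedule = None
    for _ in range(max_iters):
        # Data allocation frozen: job scheduling becomes a MILP.
        schedule, _ = solve_schedule_milp(allocation)       # hypothetical
        # Schedule frozen: data allocation becomes a MILP.
        allocation, cost = solve_allocation_milp(schedule)  # hypothetical
        if best_cost - cost < tol:  # stop once improvement stalls
            break
        best_cost = cost
    return schedule, allocation, best_cost
```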
5. arXiv:2501.14654 [pdf, other] cs.LG, cs.AI, cs.MA (Machine Learning; Artificial Intelligence; Multiagent Systems)

   MedAgentBench: A Realistic Virtual EHR Environment to Benchmark Medical LLM Agents

   Authors: Yixing Jiang, Kameron C. Black, Gloria Geng, Danny Park, James Zou, Andrew Y. Ng, Jonathan H. Chen

   Abstract: Recent large language models (LLMs) have demonstrated significant advancements, particularly in their ability to serve as agents, thereby surpassing their traditional role as chatbots. These agents can leverage their planning and tool utilization capabilities to address tasks specified at a high level. However, a standardized dataset to benchmark the agent capabilities of LLMs in medical applications is currently lacking, making the evaluation of LLMs on complex tasks in interactive healthcare environments challenging. To address this gap, we introduce MedAgentBench, a broad evaluation suite designed to assess the agent capabilities of large language models within medical records contexts. MedAgentBench encompasses 300 patient-specific clinically-derived tasks from 10 categories written by human physicians, realistic profiles of 100 patients with over 700,000 data elements, a FHIR-compliant interactive environment, and an accompanying codebase. The environment uses the standard APIs and communication infrastructure of modern EMR systems, so it can be easily migrated into live EMR systems. MedAgentBench is an unsaturated, agent-oriented benchmark at which current state-of-the-art LLMs exhibit some ability to succeed. The best model (Claude 3.5 Sonnet v2) achieves a success rate of 69.67%. However, there is still substantial room for improvement, which gives the community a clear direction for optimization. Furthermore, there is significant variation in performance across task categories. MedAgentBench is publicly available at https://github.com/stanfordmlgroup/MedAgentBench , offering a valuable framework for model developers to track progress and drive continuous improvements in the agent capabilities of large language models within the medical domain.

   Submitted 12 February, 2025; v1 submitted 24 January, 2025; originally announced January 2025.
6. arXiv:2501.14278 [pdf, other] cs.LG, cs.AI (Machine Learning; Artificial Intelligence)

   Active Learning for Continual Learning: Keeping the Past Alive in the Present

   Authors: Jaehyun Park, Dongmin Park, Jae-Gil Lee

   Abstract: Continual learning (CL) enables deep neural networks to adapt to ever-changing data distributions. In practice, annotation may be costly, leading to active continual learning (ACL), which performs active learning (AL) in CL scenarios where reducing the labeling cost by selecting the most informative subset is preferable. However, conventional AL strategies are not suitable for ACL, as they focus solely on learning new knowledge, leading to catastrophic forgetting of previously learned tasks. ACL therefore requires a new AL strategy that can balance the prevention of catastrophic forgetting against the ability to quickly learn new tasks. In this paper, we propose AccuACL (Accumulated informativeness-based Active Continual Learning), which makes novel use of the Fisher information matrix as a criterion for sample selection, derived from a theoretical analysis of Fisher-optimality preservation within the ACL framework, while also addressing the scalability issue of Fisher information-based AL. Extensive experiments demonstrate that AccuACL significantly outperforms AL baselines across various CL algorithms, improving average accuracy and forgetting by 23.8% and 17.0% on average, respectively.

   Submitted 24 January, 2025; originally announced January 2025.
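As rough intuition for Fisher-information-based selection, a sample's informativeness can be proxied by the squared gradient norm of the loss at the model's own prediction (a diagonal empirical Fisher term). The PyTorch sketch below scores an unlabeled pool this way; it is a simplified proxy, not AccuACL's accumulated criterion:

```python
import torch

def fisher_score(model, loss_fn, x):
    """Informativeness proxy: squared gradient norm of the loss at the
    model's own prediction (a diagonal empirical Fisher term). A
    simplified stand-in for AccuACL's accumulated criterion.
    """
    model.zero_grad()
    logits = model(x.unsqueeze(0))
    y_hat = logits.argmax(dim=1)        # no label yet: use prediction
    loss_fn(logits, y_hat).backward()
    return sum((p.grad ** 2).sum().item()
               for p in model.parameters() if p.grad is not None)

def select_batch(model, loss_fn, pool, k):
    """Pick the indices of the k highest-scoring unlabeled samples."""
    scores = [fisher_score(model, loss_fn, x) for x in pool]
    return sorted(range(len(pool)), key=lambda i: -scores[i])[:k]
```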
7. arXiv:2501.05803 [pdf, other] cs.LG, cs.AI, cs.CV, math.ST (Machine Learning; Artificial Intelligence; Computer Vision and Pattern Recognition; Statistics Theory)

   Test-time Alignment of Diffusion Models without Reward Over-optimization

   Authors: Sunwoo Kim, Minkyu Kim, Dongmin Park

   Abstract: Diffusion models excel in generative tasks, but aligning them with specific objectives while maintaining their versatility remains challenging. Existing fine-tuning methods often suffer from reward over-optimization, while approximate guidance approaches fail to optimize target rewards effectively. Addressing these limitations, we propose a training-free, test-time method based on Sequential Monte Carlo (SMC) to sample from the reward-aligned target distribution. Our approach, tailored for diffusion sampling and incorporating tempering techniques, achieves comparable or superior target rewards to fine-tuning methods while preserving diversity and cross-reward generalization. We demonstrate its effectiveness in single-reward optimization, multi-objective scenarios, and online black-box optimization. This work offers a robust solution for aligning diffusion models with diverse downstream objectives without compromising their general capabilities. Code is available at https://github.com/krafton-ai/DAS.

   Submitted 10 February, 2025; v1 submitted 10 January, 2025; originally announced January 2025.
   Comments: ICLR 2025
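At its core, an SMC alignment step maintains a population of candidate samples and resamples them in proportion to tempered reward weights. A generic sketch of that single step (the weighting and tempering schedule in the paper's method are more refined than this):

```python
import numpy as np

def smc_resample(particles, rewards, lam):
    """One tempered SMC step: resample candidate denoising trajectories
    in proportion to exp(lam * reward). A generic sketch, not the exact
    diffusion-tailored weighting used by the paper.
    """
    w = np.exp(lam * (rewards - rewards.max()))  # stabilized weights
    w /= w.sum()
    idx = np.random.choice(len(particles), size=len(particles), p=w)
    return [particles[i] for i in idx]
```

Raising `lam` sharpens the weights toward the highest-reward particles; keeping it moderate preserves diversity, which is how SMC-style sampling avoids the over-optimization that fine-tuning can suffer.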
8. arXiv:2501.02687 [pdf, other] quant-ph, cs.LG (Quantum Physics; Machine Learning)

   Improving Quantum Machine Learning via Heat-Bath Algorithmic Cooling

   Authors: Nayeli A. Rodríguez-Briones, Daniel K. Park

   Abstract: This work introduces an approach rooted in quantum thermodynamics to enhance sampling efficiency in quantum machine learning (QML). We propose conceptualizing quantum supervised learning as a thermodynamic cooling process. Building on this concept, we develop a quantum refrigerator protocol that enhances sample efficiency during training and prediction without the need for Grover iterations or quantum phase estimation. Inspired by heat-bath algorithmic cooling protocols, our method alternates entropy compression and thermalization steps to decrease the entropy of qubits, increasing polarization towards the dominant bias. This technique minimizes the computational overhead associated with estimating classification scores and gradients, presenting a practical and efficient solution for QML algorithms compatible with noisy intermediate-scale quantum devices.

   Submitted 5 January, 2025; originally announced January 2025.
   Comments: 17 pages, 7 figures
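The compression-thermalization alternation has a compact classical caricature: a 3-bit majority vote maps polarizations (e1, e2, e3) to (e1 + e2 + e3 - e1*e2*e3)/2 on the target, and refreshing the two reset qubits restores them to the bath polarization. A toy simulation of the cooling dynamics, illustrative only and not the paper's protocol:

```python
def hbac_polarization(eps_bath, rounds=10):
    """Toy classical simulation of heat-bath algorithmic cooling:
    alternate an entropy-compression step (3-bit majority voting,
    which sends polarizations (e1, e2, e3) to
    (e1 + e2 + e3 - e1*e2*e3) / 2 on the target) with rethermalizing
    the two reset qubits back to the bath polarization.
    """
    eps = eps_bath  # target qubit starts at thermal equilibrium
    for _ in range(rounds):
        e1, e2, e3 = eps, eps_bath, eps_bath  # reset qubits refreshed
        eps = (e1 + e2 + e3 - e1 * e2 * e3) / 2
    return eps

# Example: a bath polarization of 0.1 cools the target toward ~0.198.
print(hbac_polarization(0.1))
```

Iterating converges to 2*eps_bath / (1 + eps_bath**2), roughly double the thermal polarization for weakly polarized baths, which is the sense in which compression plus thermalization "cools" the target.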
9. arXiv:2412.16337 [pdf, other] quant-ph, cs.ET (Quantum Physics; Emerging Technologies)

   Schmidt quantum compressor

   Authors: Israel F. Araujo, Hyeondo Oh, Nayeli A. Rodríguez-Briones, Daniel K. Park

   Abstract: This work introduces the Schmidt quantum compressor, an innovative approach to quantum data compression that leverages the principles of Schmidt decomposition to encode quantum information efficiently. In contrast to traditional variational quantum autoencoders, which depend on stochastic optimization and face challenges such as shot noise, barren plateaus, and non-convex optimization landscapes, our deterministic method substantially reduces the complexity and computational overhead of quantum data compression. We evaluate the performance of the compressor through numerical experiments, demonstrating its ability to achieve high fidelity in quantum state reconstruction compared to variational quantum algorithms. Furthermore, we demonstrate the practical utility of the Schmidt quantum compressor in one-class classification tasks.

   Submitted 20 December, 2024; originally announced December 2024.
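The principle behind the compressor is easy to reproduce numerically: reshaping a bipartite state vector into a matrix and taking its SVD yields the Schmidt coefficients, and compression corresponds to keeping only the leading terms. A numpy sketch of the decomposition (the quantum-circuit construction in the paper is, of course, more involved):

```python
import numpy as np

def schmidt_decomposition(state, dim_a, dim_b):
    """Schmidt decomposition of a bipartite pure state via SVD.

    Reshaping the amplitude vector into a dim_a x dim_b matrix and
    taking its SVD yields the Schmidt coefficients (singular values)
    and the local bases; truncating to the leading terms compresses
    the state. This mirrors the principle, not the quantum circuit.
    """
    M = state.reshape(dim_a, dim_b)
    u, s, vh = np.linalg.svd(M, full_matrices=False)
    return s, u, vh  # coefficients, basis of subsystem A, basis of B

# Example: a Bell state has two equal Schmidt coefficients 1/sqrt(2).
bell = np.array([1.0, 0.0, 0.0, 1.0]) / np.sqrt(2)
coeffs, _, _ = schmidt_decomposition(bell, 2, 2)
print(coeffs)  # [0.7071..., 0.7071...]
```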
10. arXiv:2412.14220 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)

    Distilled Pooling Transformer Encoder for Efficient Realistic Image Dehazing

    Authors: Le-Anh Tran, Dong-Chul Park

    Abstract: This paper proposes a lightweight neural network designed for realistic image dehazing, utilizing a Distilled Pooling Transformer Encoder, named DPTE-Net. Recently, while vision transformers (ViTs) have achieved great success in various vision tasks, their self-attention (SA) module's complexity scales quadratically with image resolution, hindering their applicability on resource-constrained devices. To overcome this, the proposed DPTE-Net substitutes traditional SA modules with efficient pooling mechanisms, significantly reducing computational demands while preserving ViTs' learning capabilities. To further enhance semantic feature learning, a distillation-based training process is implemented, transferring rich knowledge from a larger teacher network to DPTE-Net. Additionally, DPTE-Net is trained within a generative adversarial network (GAN) framework, leveraging the strong generalization of GANs in image restoration, and employs a transmission-aware loss function to dynamically adapt to varying haze densities. Experimental results on various benchmark datasets show that DPTE-Net achieves competitive dehazing performance compared to state-of-the-art methods while maintaining low computational complexity, making it a promising solution for resource-limited applications. The code of this work is available at https://github.com/tranleanh/dpte-net.

    Submitted 18 December, 2024; originally announced December 2024.
    Comments: 18 pages, 17 figures
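The pooling-for-attention substitution can be illustrated with a PoolFormer-style token mixer, which the abstract's description resembles; the module below is a generic sketch, not the authors' exact encoder block:

```python
import torch.nn as nn

class PoolingMixer(nn.Module):
    """Token mixer that swaps quadratic self-attention for average
    pooling (a PoolFormer-style sketch in the spirit of DPTE-Net's
    pooling-based encoder). Cost is linear in the number of pixels
    rather than quadratic.
    """
    def __init__(self, pool_size=3):
        super().__init__()
        self.pool = nn.AvgPool2d(pool_size, stride=1,
                                 padding=pool_size // 2,
                                 count_include_pad=False)

    def forward(self, x):          # x: (batch, channels, H, W)
        return self.pool(x) - x    # residual form used by PoolFormer
```

Average pooling mixes tokens within a local window, which is where the computational savings over quadratic self-attention come from; subtracting x keeps the module well-behaved inside a residual block.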
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11525">arXiv:2412.11525</a> <span> [<a href="https://arxiv.org/pdf/2412.11525">pdf</a>, <a href="https://arxiv.org/format/2412.11525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sequence Matters: Harnessing Video Models in 3D Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ko%2C+H">Hyun-kyu Ko</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongheok Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+Y">Youngin Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Byeonghyeon Lee</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Juhee Han</a>, <a href="/search/cs?searchtype=author&query=Park%2C+E">Eunbyung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11525v3-abstract-short" style="display: inline;"> 3D super-resolution aims to reconstruct high-fidelity 3D models from low-resolution (LR) multi-view images. Early studies primarily focused on single-image super-resolution (SISR) models to upsample LR images into high-resolution images. However, these methods often lack view consistency because they operate independently on each image. Although various post-processing techniques have been extensi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11525v3-abstract-full').style.display = 'inline'; document.getElementById('2412.11525v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11525v3-abstract-full" style="display: none;"> 3D super-resolution aims to reconstruct high-fidelity 3D models from low-resolution (LR) multi-view images. Early studies primarily focused on single-image super-resolution (SISR) models to upsample LR images into high-resolution images. However, these methods often lack view consistency because they operate independently on each image. Although various post-processing techniques have been extensively explored to mitigate these inconsistencies, they have yet to fully resolve the issues. In this paper, we perform a comprehensive study of 3D super-resolution by leveraging video super-resolution (VSR) models. By utilizing VSR models, we ensure a higher degree of spatial consistency and can reference surrounding spatial information, leading to more accurate and detailed reconstructions. Our findings reveal that VSR models can perform remarkably well even on sequences that lack precise spatial alignment. Given this observation, we propose a simple yet practical approach to align LR images without involving fine-tuning or generating 'smooth' trajectory from the trained 3D models over LR images. The experimental results show that the surprisingly simple algorithms can achieve the state-of-the-art results of 3D super-resolution tasks on standard benchmark datasets, such as the NeRF-synthetic and MipNeRF-360 datasets. 
Project page: https://ko-lani.github.io/Sequence-Matters <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11525v3-abstract-full').style.display = 'none'; document.getElementById('2412.11525v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://ko-lani.github.io/Sequence-Matters</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68U10; 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.5; I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06303">arXiv:2412.06303</a> <span> [<a href="https://arxiv.org/pdf/2412.06303">pdf</a>, <a href="https://arxiv.org/format/2412.06303">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DSAI: Unbiased and Interpretable Latent Feature Extraction for Data-Centric AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+H">Hyowon Cho</a>, <a href="/search/cs?searchtype=author&query=Ka%2C+S">Soonwon Ka</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daechul Park</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jaewook Kang</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+M">Minjoon Seo</a>, <a href="/search/cs?searchtype=author&query=Son%2C+B">Bokyung Son</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06303v2-abstract-short" style="display: inline;"> Large language models (LLMs) often struggle to objectively identify latent characteristics in large datasets due to their reliance on pre-trained knowledge rather than actual data patterns. To address this data grounding issue, we propose Data Scientist AI (DSAI), a framework that enables unbiased and interpretable feature extraction through a multi-stage pipeline with quantifiable prominence metr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06303v2-abstract-full').style.display = 'inline'; document.getElementById('2412.06303v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06303v2-abstract-full" style="display: none;"> Large language models (LLMs) often struggle to objectively identify latent characteristics in large datasets due to their reliance on pre-trained knowledge rather than actual data patterns. 
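One simple way to realize the "ordering without fine-tuning" idea is a greedy nearest-neighbor chain over image features, so the VSR model sees an approximately smooth pseudo-video. The distance metric and starting frame below are illustrative choices, not the authors' exact algorithm:

```python
import numpy as np

def greedy_order(features):
    """Greedily chain images into a 'smooth' pseudo-video sequence via
    nearest-neighbor hops in feature space, echoing the idea of
    ordering multi-view LR images so a video super-resolution model
    sees temporally coherent input. Euclidean distance and frame 0 as
    the start are illustrative choices.
    """
    n = len(features)
    order, used = [0], {0}
    while len(order) < n:
        dists = np.linalg.norm(features - features[order[-1]], axis=1)
        dists[list(used)] = np.inf          # do not revisit frames
        nxt = int(np.argmin(dists))
        order.append(nxt)
        used.add(nxt)
    return order  # indices forming the ordered sequence
```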
To address this data grounding issue, we propose Data Scientist AI (DSAI), a framework that enables unbiased and interpretable feature extraction through a multi-stage pipeline with quantifiable prominence metrics for evaluating extracted features. On synthetic datasets with known ground-truth features, DSAI demonstrates high recall in identifying expert-defined features while faithfully reflecting the underlying data. Applications on real-world datasets illustrate the framework's practical utility in uncovering meaningful patterns with minimal expert oversight, supporting use cases such as interpretable classification. The title of our paper is chosen from multiple candidates based on DSAI-generated criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06303v2-abstract-full').style.display = 'none'; document.getElementById('2412.06303v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04137">arXiv:2412.04137</a> <span> [<a href="https://arxiv.org/pdf/2412.04137">pdf</a>, <a href="https://arxiv.org/format/2412.04137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Text Change Detection in Multilingual Documents Using Image Comparison </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Doyoung Park</a>, <a href="/search/cs?searchtype=author&query=Yarram%2C+N+R">Naresh Reddy Yarram</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sunjin Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minkyu Kim</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+S">Seongho Cho</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taehee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04137v1-abstract-short" style="display: inline;"> Document comparison typically relies on optical character recognition (OCR) as its core technology. However, OCR requires the selection of appropriate language models for each document and the performance of multilingual or hybrid models remains limited. To overcome these challenges, we propose text change detection (TCD) using an image comparison model tailored for multilingual documents. 
Unlike… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04137v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04137v1-abstract-full" style="display: none;"> Document comparison typically relies on optical character recognition (OCR) as its core technology. However, OCR requires the selection of appropriate language models for each document and the performance of multilingual or hybrid models remains limited. To overcome these challenges, we propose text change detection (TCD) using an image comparison model tailored for multilingual documents. Unlike OCR-based approaches, our method employs word-level text image-to-image comparison to detect changes. Our model generates bidirectional change segmentation maps between the source and target documents. To enhance performance without requiring explicit text alignment or scaling preprocessing, we employ correlations among multi-scale attention features. We also construct a benchmark dataset comprising actual printed and scanned word pairs in various languages to evaluate our model. We validate our approach using our benchmark dataset and public benchmarks Distorted Document Images and the LRDE Document Binarization Dataset. We compare our model against state-of-the-art semantic segmentation and change detection models, as well as against conventional OCR-based models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04137v1-abstract-full').style.display = 'none'; document.getElementById('2412.04137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 11 figures, 6 tables; WACV 2025 accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06919">arXiv:2411.06919</a> <span> [<a href="https://arxiv.org/pdf/2411.06919">pdf</a>, <a href="https://arxiv.org/format/2411.06919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Understanding Generalization in Quantum Machine Learning with Margins </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hur%2C+T">Tak Hur</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D+K">Daniel K. Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06919v1-abstract-short" style="display: inline;"> Understanding and improving generalization capabilities is crucial for both classical and quantum machine learning (QML). 
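<p class="is-size-7">For concreteness (an illustrative definition, not code from the paper): margin-based analyses of the kind summarized below are typically stated in terms of the per-sample classification margin, the true-class score minus the best competing score, and it is the distribution of these margins over the data, rather than parameter count, that serves as the generalization predictor.</p>
<pre><code>
# Per-sample multiclass margins (sketch): logit of the true class minus the
# largest competing logit; a negative margin means the sample is misclassified.
import numpy as np

def margins(logits, labels):
    """logits: (N, C) float array, labels: (N,) int array -> (N,) margins."""
    idx = np.arange(len(labels))
    true_scores = logits[idx, labels]
    rival = logits.astype(float).copy()
    rival[idx, labels] = -np.inf
    return true_scores - rival.max(axis=1)

# A margin-based metric then looks at, e.g., the fraction of small margins:
# np.mean(margins(f_X, y) <= gamma)
</code></pre>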
Recent studies have revealed shortcomings in current generalization theories, particularly those relying on uniform bounds, across both classical and quantum settings. In this work, we present a margin-based generalization bound for QML models, providing a more reliable framewo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06919v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06919v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06919v1-abstract-full" style="display: none;"> Understanding and improving generalization capabilities is crucial for both classical and quantum machine learning (QML). Recent studies have revealed shortcomings in current generalization theories, particularly those relying on uniform bounds, across both classical and quantum settings. In this work, we present a margin-based generalization bound for QML models, providing a more reliable framework for evaluating generalization. Our experimental studies on the quantum phase recognition (QPR) dataset demonstrate that margin-based metrics are strong predictors of generalization performance, outperforming traditional metrics like parameter count. By connecting this margin-based metric to quantum information theory, we demonstrate how to enhance the generalization performance of QML through a classical-quantum hybrid approach when applied to classical data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06919v1-abstract-full').style.display = 'none'; document.getElementById('2411.06919v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02751">arXiv:2411.02751</a> <span> [<a href="https://arxiv.org/pdf/2411.02751">pdf</a>, <a href="https://arxiv.org/format/2411.02751">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Expressivity of deterministic quantum computation with one qubit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yujin Kim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D+K">Daniel K. Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02751v1-abstract-short" style="display: inline;"> Deterministic quantum computation with one qubit (DQC1) is of significant theoretical and practical interest due to its computational advantages in certain problems, despite its subuniversality with limited quantum resources. In this work, we introduce parameterized DQC1 as a quantum machine learning model. 
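<p class="is-size-7">A small numerical illustration of the DQC1 quantity involved in the gradient property stated next (a sketch under simplifying assumptions, not the paper's protocol): the DQC1 expectation of an n-qubit unitary U is Re tr(U)/2^n, and for a gate of the form exp(-i t P/2) V with P squaring to the identity, the exact derivative is recoverable from two shifted evaluations of the same quantity.</p>
<pre><code>
# Sketch: DQC1-style normalized trace and a two-point shift rule for its
# gradient, checked against finite differences. Assumes U(t) = exp(-i*t*P/2) @ V
# with P @ P = I, so f(t) = A*cos(t/2) + B*sin(t/2) and
# f'(t) = (f(t + pi) - f(t - pi)) / 4.
import numpy as np

rng = np.random.default_rng(0)
n = 3
dim = 2 ** n
P = np.kron(np.diag([1.0, -1.0]), np.eye(dim // 2))        # Z on the first qubit
V, _ = np.linalg.qr(rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)))

def f(t):
    Ut = np.diag(np.exp(-1j * t * np.diag(P) / 2)) @ V     # exp(-i t P/2) V, P diagonal
    return np.trace(Ut).real / dim                         # DQC1 expectation

t0 = 0.7
shift_grad = (f(t0 + np.pi) - f(t0 - np.pi)) / 4
fd_grad = (f(t0 + 1e-6) - f(t0 - 1e-6)) / 2e-6
print(shift_grad, fd_grad)                                 # the two agree closely
</code></pre>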
We demonstrate that the gradient of the measurement outcome of a DQC1 circuit with respect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02751v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02751v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02751v1-abstract-full" style="display: none;"> Deterministic quantum computation with one qubit (DQC1) is of significant theoretical and practical interest due to its computational advantages in certain problems, despite its subuniversality with limited quantum resources. In this work, we introduce parameterized DQC1 as a quantum machine learning model. We demonstrate that the gradient of the measurement outcome of a DQC1 circuit with respect to its gate parameters can be computed directly using the DQC1 protocol. This allows for gradient-based optimization of DQC1 circuits, positioning DQC1 as the sole quantum protocol for both training and inference. We then analyze the expressivity of the parameterized DQC1 circuits, characterizing the set of learnable functions, and show that DQC1-based machine learning (ML) is as powerful as quantum neural networks based on universal computation. Our findings highlight the potential of DQC1 as a practical and versatile platform for ML, capable of rivaling more complex quantum computing models while utilizing simpler quantum resources. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02751v1-abstract-full').style.display = 'none'; document.getElementById('2411.02751v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02278">arXiv:2411.02278</a> <span> [<a href="https://arxiv.org/pdf/2411.02278">pdf</a>, <a href="https://arxiv.org/format/2411.02278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Is This the Same Code? 
A Comprehensive Study of Decompilation Techniques for WebAssembly Binaries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei-Cheng Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yutian Yan</a>, <a href="/search/cs?searchtype=author&query=Egilsson%2C+H+D">Hallgrimur David Egilsson</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">David Park</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+S">Steven Chan</a>, <a href="/search/cs?searchtype=author&query=Hauser%2C+C">Christophe Hauser</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weihang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02278v1-abstract-short" style="display: inline;"> WebAssembly is a low-level bytecode language designed for client-side execution in web browsers. The need for decompilation techniques that recover high-level source code from WASM binaries has grown as WASM continues to gain widespread adoption and raises security concerns. However, little research has been done to assess the quality of decompiled code from WASM. This paper aims to fill this gap by c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02278v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02278v1-abstract-full" style="display: none;"> WebAssembly is a low-level bytecode language designed for client-side execution in web browsers. The need for decompilation techniques that recover high-level source code from WASM binaries has grown as WASM continues to gain widespread adoption and raises security concerns. However, little research has been done to assess the quality of decompiled code from WASM. This paper aims to fill this gap by conducting a comprehensive comparative analysis between decompiled C code from WASM binaries and state-of-the-art native binary decompilers. We present a novel framework for empirically evaluating C-based decompilers from various aspects, including correctness, readability, and structural similarity. The proposed metrics were validated for practicality in decompiler assessment and provide insightful observations regarding the characteristics and constraints of existing decompiled code. This in turn contributes to bolstering the security and reliability of software systems that rely on WASM and native binaries. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02278v1-abstract-full').style.display = 'none'; document.getElementById('2411.02278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SecureComm'24: Proceedings of the 20th EAI International Conference on Security and Privacy in Communication Networks</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01996">arXiv:2411.01996</a> <span> [<a href="https://arxiv.org/pdf/2411.01996">pdf</a>, <a href="https://arxiv.org/format/2411.01996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Culinary Class Wars: Evaluating LLMs using ASH in Cuisine Transfer Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hoonick Lee</a>, <a href="/search/cs?searchtype=author&query=Gim%2C+M">Mogan Gim</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Donghyeon Park</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+D">Donghee Choi</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jaewoo Kang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01996v1-abstract-short" style="display: inline;"> The advent of Large Language Models (LLMs) has shown promise in various creative domains, including culinary arts. However, many LLMs still struggle to deliver the desired level of culinary creativity, especially when tasked with adapting recipes to meet specific cultural requirements. This study focuses on cuisine transfer, applying elements of one cuisine to another, to assess LLMs' culinary crea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01996v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01996v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01996v1-abstract-full" style="display: none;"> The advent of Large Language Models (LLMs) has shown promise in various creative domains, including culinary arts. However, many LLMs still struggle to deliver the desired level of culinary creativity, especially when tasked with adapting recipes to meet specific cultural requirements. This study focuses on cuisine transfer, applying elements of one cuisine to another, to assess LLMs' culinary creativity. We employ a diverse set of LLMs to generate and evaluate culturally adapted recipes, comparing their evaluations against LLM and human judgments. We introduce the ASH (authenticity, sensitivity, harmony) benchmark to evaluate LLMs' recipe generation abilities in the cuisine transfer task, assessing their cultural accuracy and creativity in the culinary domain. Our findings reveal crucial insights into both generative and evaluative capabilities of LLMs in the culinary domain, highlighting strengths and limitations in understanding and applying cultural nuances in recipe creation. 
The code and dataset used in this project will be openly available at http://github.com/dmis-lab/CulinaryASH. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01996v1-abstract-full').style.display = 'none'; document.getElementById('2411.01996v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00322">arXiv:2411.00322</a> <span> [<a href="https://arxiv.org/pdf/2411.00322">pdf</a>, <a href="https://arxiv.org/format/2411.00322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Constant Acceleration Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dogyun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sojin Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sihyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taehoon Lee</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Youngjoon Hong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00322v1-abstract-short" style="display: inline;"> Rectified flow and reflow procedures have significantly advanced fast generation by progressively straightening ordinary differential equation (ODE) flows. They operate under the assumption that image and noise pairs, known as couplings, can be approximated by straight trajectories with constant velocity. However, we observe that modeling with constant velocity and using reflow procedures have lim… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00322v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00322v1-abstract-full" style="display: none;"> Rectified flow and reflow procedures have significantly advanced fast generation by progressively straightening ordinary differential equation (ODE) flows. They operate under the assumption that image and noise pairs, known as couplings, can be approximated by straight trajectories with constant velocity. However, we observe that modeling with constant velocity and using reflow procedures have limitations in accurately learning straight trajectories between pairs, resulting in suboptimal performance in few-step generation. To address these limitations, we introduce Constant Acceleration Flow (CAF), a novel framework based on a simple constant acceleration equation. 
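<p class="is-size-7">To make the constant-acceleration parameterization concrete (an illustrative toy, not the trained model: in CAF the initial velocity and acceleration are learned, whereas here they are closed-form), a trajectory x(t) = x0 + v0*t + 0.5*a*t^2 that is forced to hit the data x1 at t = 1 fixes a = 2*(x1 - x0 - v0).</p>
<pre><code>
# Constant-acceleration trajectory between a noise sample x0 and data x1 (sketch).
import numpy as np

def caf_trajectory(x0, x1, v0, t):
    a = 2.0 * (x1 - x0 - v0)          # endpoint constraint: x(1) = x1
    return x0 + v0 * t + 0.5 * a * t**2

x0 = np.random.randn(4)               # "noise"
x1 = np.ones(4)                       # "data"
v0 = 1.5 * (x1 - x0)                  # any initial velocity; learning would set this
assert np.allclose(caf_trajectory(x0, x1, v0, 1.0), x1)
</code></pre>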
CAF introduces acceleration as an additional learnable variable, allowing for more expressive and accurate estimation of the ODE flow. Moreover, we propose two techniques to further improve estimation accuracy: initial velocity conditioning for the acceleration model and a reflow process for the initial velocity. Our comprehensive studies on toy datasets, CIFAR-10, and ImageNet 64x64 demonstrate that CAF outperforms state-of-the-art baselines for one-step generation. We also show that CAF dramatically improves few-step coupling preservation and inversion over Rectified flow. Code is available at https://github.com/mlvlab/CAF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00322v1-abstract-full').style.display = 'none'; document.getElementById('2411.00322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22376">arXiv:2410.22376</a> <span> [<a href="https://arxiv.org/pdf/2410.22376">pdf</a>, <a href="https://arxiv.org/format/2410.22376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion Models on Rare Concepts with LLM Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongmin Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sebin Kim</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+T">Taehong Moon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minkyu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kangwook Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Jaewoong Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22376v2-abstract-short" style="display: inline;"> State-of-the-art text-to-image (T2I) diffusion models often struggle to generate rare compositions of concepts, e.g., objects with unusual attributes. In this paper, we show that the compositional generation power of diffusion models on such rare concepts can be significantly enhanced by Large Language Model (LLM) guidance. 
We start with empirical and theoretical analysis, demonstrating that e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22376v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22376v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22376v2-abstract-full" style="display: none;"> State-of-the-art text-to-image (T2I) diffusion models often struggle to generate rare compositions of concepts, e.g., objects with unusual attributes. In this paper, we show that the compositional generation power of diffusion models on such rare concepts can be significantly enhanced by Large Language Model (LLM) guidance. We start with empirical and theoretical analysis, demonstrating that exposing frequent concepts relevant to the target rare concepts during the diffusion sampling process yields more accurate concept composition. Based on this, we propose a training-free approach, R2F, that plans and executes the overall rare-to-frequent concept guidance throughout the diffusion inference by leveraging the abundant semantic knowledge in LLMs. Our framework is flexible across any pre-trained diffusion models and LLMs, and can be seamlessly integrated with the region-guided diffusion approaches. In extensive experiments on three datasets, including our newly proposed benchmark RareBench, which contains various prompts with rare compositions of concepts, R2F significantly surpasses existing models, including SD3.0 and FLUX, by up to 28.1%p in T2I alignment. Code is available at https://github.com/krafton-ai/Rare-to-Frequent. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22376v2-abstract-full').style.display = 'none'; document.getElementById('2410.22376v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07940">arXiv:2410.07940</a> <span> [<a href="https://arxiv.org/pdf/2410.07940">pdf</a>, <a href="https://arxiv.org/format/2410.07940">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> AI Surrogate Model for Distributed Computing Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D+K">David K. Park</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yihui Ren</a>, <a href="/search/cs?searchtype=author&query=Kilic%2C+O+O">Ozgur O. 
Kilic</a>, <a href="/search/cs?searchtype=author&query=Korchuganova%2C+T">Tatiana Korchuganova</a>, <a href="/search/cs?searchtype=author&query=Vatsavai%2C+S+S">Sairam Sri Vatsavai</a>, <a href="/search/cs?searchtype=author&query=Boudreau%2C+J">Joseph Boudreau</a>, <a href="/search/cs?searchtype=author&query=Chowdhury%2C+T">Tasnuva Chowdhury</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+S">Shengyu Feng</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+R">Raees Khan</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jaehyung Kim</a>, <a href="/search/cs?searchtype=author&query=Klasky%2C+S">Scott Klasky</a>, <a href="/search/cs?searchtype=author&query=Maeno%2C+T">Tadashi Maeno</a>, <a href="/search/cs?searchtype=author&query=Nilsson%2C+P">Paul Nilsson</a>, <a href="/search/cs?searchtype=author&query=Outschoorn%2C+V+I+M">Verena Ingrid Martinez Outschoorn</a>, <a href="/search/cs?searchtype=author&query=Podhorszki%2C+N">Norbert Podhorszki</a>, <a href="/search/cs?searchtype=author&query=Suter%2C+F">Frederic Suter</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiming Yang</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+S">Shinjae Yoo</a>, <a href="/search/cs?searchtype=author&query=Klimentov%2C+A">Alexei Klimentov</a>, <a href="/search/cs?searchtype=author&query=Hoisie%2C+A">Adolfy Hoisie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07940v1-abstract-short" style="display: inline;"> Large-scale international scientific collaborations, such as ATLAS, Belle II, CMS, and DUNE, generate vast volumes of data. These experiments necessitate substantial computational power for varied tasks, including structured data processing, Monte Carlo simulations, and end-user analysis. Centralized workflow and data management systems are employed to handle these demands, but current decision-ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07940v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07940v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07940v1-abstract-full" style="display: none;"> Large-scale international scientific collaborations, such as ATLAS, Belle II, CMS, and DUNE, generate vast volumes of data. These experiments necessitate substantial computational power for varied tasks, including structured data processing, Monte Carlo simulations, and end-user analysis. Centralized workflow and data management systems are employed to handle these demands, but current decision-making processes for data placement and payload allocation are often heuristic and disjointed. This optimization challenge potentially could be addressed using contemporary machine learning methods, such as reinforcement learning, which, in turn, require access to extensive data and an interactive environment. Instead, we propose a generative surrogate modeling approach to address the lack of training data and concerns about privacy preservation. We have collected and processed real-world job submission records, totaling more than two million jobs through 150 days, and applied four generative models for tabular data -- TVAE, CTAGGAN+, SMOTE, and TabDDPM -- to these datasets, thoroughly evaluating their performance. 
Along with measuring the discrepancy among feature-wise distributions separately, we also evaluate pair-wise feature correlations, distance to closest record, and responses to pre-trained models. Our experiments indicate that SMOTE and TabDDPM can generate similar tabular data, almost indistinguishable from the ground truth. Yet, as a non-learning method, SMOTE ranks the lowest in privacy preservation. As a result, we conclude that the probabilistic-diffusion-model-based TabDDPM is the most suitable generative model for managing job record data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07940v1-abstract-full').style.display = 'none'; document.getElementById('2410.07940v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures, to be presented at the SC24 AI4S Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06134">arXiv:2410.06134</a> <span> [<a href="https://arxiv.org/pdf/2410.06134">pdf</a>, <a href="https://arxiv.org/format/2410.06134">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Label Smoothing for Out-of-Distribution Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+M">Mingle Xu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jaehwan Lee</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S">Sook Yoon</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D+S">Dong Sun Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06134v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) detection, which aims to distinguish unknown classes from known classes, has received increasing attention recently. A main challenge is the unavailability of samples from the unknown classes during training, and an effective strategy is to improve the performance for known classes. Using beneficial strategies such as data augmentation and longer training is t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06134v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06134v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06134v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) detection, which aims to distinguish unknown classes from known classes, has received increasing attention recently. A main challenge is the unavailability of samples from the unknown classes during training, and an effective strategy is to improve the performance for known classes. Using beneficial strategies such as data augmentation and longer training is thus a way to improve OOD detection. 
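<p class="is-size-7">A tiny numeric illustration of the capping effect analyzed next (standard label smoothing, not the paper's code): with smoothing factor eps and C classes, the target's maximal probability is fixed at 1 - eps + eps/C, which shrinks the confidence signal that OOD scoring relies on.</p>
<pre><code>
# Label smoothing caps the maximal target probability (sketch).
import numpy as np

C, eps = 10, 0.1
one_hot = np.eye(C)[0]
smoothed = one_hot * (1 - eps) + eps / C
print(one_hot.max(), smoothed.max())   # 1.0 vs 0.91: the fixed ceiling ALS removes
</code></pre>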
However, label smoothing, an effective method for classifying known classes, degrades the performance of OOD detection, and this phenomenon has been underexplored. In this paper, we first show that the limited, predefined learning target in label smoothing results in smaller maximal probabilities and logits, which further leads to worse OOD detection performance. To mitigate this issue, we then propose a novel regularization method, called adaptive label smoothing (ALS), whose core idea is to push the non-true classes toward equal probabilities, whereas the maximal probability is neither fixed nor limited. Extensive experimental results on six datasets with two backbones suggest that ALS contributes to classifying known samples and discerning unknown samples with clear margins. Our code will be available to the public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06134v1-abstract-full').style.display = 'none'; document.getElementById('2410.06134v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17885">arXiv:2409.17885</a> <span> [<a href="https://arxiv.org/pdf/2409.17885">pdf</a>, <a href="https://arxiv.org/format/2409.17885">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Sentiment Analysis of ML Projects: Bridging Emotional Intelligence and Code Quality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahmed%2C+M+S">Md Shoaib Ahmed</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongyoung Park</a>, <a href="/search/cs?searchtype=author&query=Eisty%2C+N+U">Nasir U. Eisty</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17885v1-abstract-short" style="display: inline;"> This study explores the intricate relationship between sentiment analysis (SA) and code quality within machine learning (ML) projects, illustrating how the emotional dynamics of developers affect the technical and functional attributes of software projects. 
Recognizing the vital role of developer sentiments, this research employs advanced sentiment analysis techniques to scrutinize affective states from textual interactions such as code comments, commit messages, and issue discussions within high-profile ML projects. By integrating a comprehensive dataset of popular ML repositories, this analysis applies a blend of rule-based, machine learning, and hybrid sentiment analysis methodologies to systematically quantify sentiment scores. The emotional valence expressed by developers is then correlated with a spectrum of code quality indicators, including the prevalence of bugs, vulnerabilities, security hotspots, code smells, and duplication instances. Findings from this study distinctly illustrate that positive sentiments among developers are strongly associated with superior code quality metrics manifested through reduced bugs and lower incidence of code smells. This relationship underscores the importance of fostering positive emotional environments to enhance productivity and code craftsmanship. Conversely, the analysis reveals that negative sentiments correlate with an uptick in code issues, particularly increased duplication and heightened security risks, pointing to the detrimental effects of adverse emotional conditions on project health. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17885v1-abstract-full').style.display = 'none'; document.getElementById('2409.17885v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15486">arXiv:2409.15486</a> <span> [<a href="https://arxiv.org/pdf/2409.15486">pdf</a>, <a href="https://arxiv.org/format/2409.15486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VLMine: Long-Tail Data Mining with Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+M">Mao Ye</a>, <a href="/search/cs?searchtype=author&query=Meyer%2C+G+P">Gregory P. Meyer</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zaiwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dennis Park</a>, <a href="/search/cs?searchtype=author&query=Mustikovela%2C+S+K">Siva Karthik Mustikovela</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Y">Yuning Chai</a>, <a href="/search/cs?searchtype=author&query=Wolff%2C+E+M">Eric M Wolff</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15486v1-abstract-short" style="display: inline;"> Ensuring robust performance on long-tail examples is an important problem for many real-world applications of machine learning, such as autonomous driving. This work focuses on the problem of identifying rare examples within a corpus of unlabeled data. 
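<p class="is-size-7">A minimal sketch of the keyword-frequency mining approach proposed next (illustrative only; <code>vlm_keywords</code> is a hypothetical stand-in for the VLM call, and scoring by the rarest keyword is one plausible choice): caption each unlabeled image into keywords, then rank images by how infrequent their keywords are in the corpus.</p>
<pre><code>
# Keyword-frequency mining of long-tail examples (sketch).
from collections import Counter

def mine_rare(image_ids, vlm_keywords, k=100):
    keywords = {i: set(vlm_keywords(i)) for i in image_ids}   # VLM -> keyword set
    freq = Counter(kw for kws in keywords.values() for kw in kws)
    # rarity score = corpus frequency of the image's least frequent keyword
    score = {i: min(freq[kw] for kw in kws) for i, kws in keywords.items() if kws}
    return sorted(score, key=score.get)[:k]                   # k rarest examples
</code></pre>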
We propose a simple and scalable data mining approach that leverages the knowledge contained within a large vision language model (VLM). Our approa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15486v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15486v1-abstract-full" style="display: none;"> Ensuring robust performance on long-tail examples is an important problem for many real-world applications of machine learning, such as autonomous driving. This work focuses on the problem of identifying rare examples within a corpus of unlabeled data. We propose a simple and scalable data mining approach that leverages the knowledge contained within a large vision language model (VLM). Our approach utilizes a VLM to summarize the content of an image into a set of keywords, and we identify rare examples based on keyword frequency. We find that the VLM offers a distinct signal for identifying long-tail examples when compared to conventional methods based on model uncertainty. Therefore, we propose a simple and general approach for integrating signals from multiple mining algorithms. We evaluate the proposed method on two diverse tasks: 2D image classification, in which inter-class variation is the primary source of data diversity, and 3D object detection, where intra-class variation is the main concern. Furthermore, through the detection task, we demonstrate that the knowledge extracted from 2D images is transferable to the 3D domain. Our experiments consistently show large improvements (between 10% and 50%) over the baseline techniques on several representative benchmarks: ImageNet-LT, Places-LT, and the Waymo Open Dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15486v1-abstract-full').style.display = 'none'; document.getElementById('2409.15486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14719">arXiv:2409.14719</a> <span> [<a href="https://arxiv.org/pdf/2409.14719">pdf</a>, <a href="https://arxiv.org/format/2409.14719">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> DiSPo: Diffusion-SSM based Policy Learning for Coarse-to-Fine Action Discretization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Oh%2C+N">Nayoung Oh</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+M">Moonkyeong Jung</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehyung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14719v1-abstract-short" style="display: inline;"> We aim to solve the problem of generating coarse-to-fine skills via learning from demonstrations (LfD). 
To scale precision, traditional LfD approaches often rely on extensive fine-grained demonstrations with external interpolations or dynamics models with limited generalization capabilities. For memory-efficient learning and convenient granularity change, we propose a novel diffusion-SSM based policy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14719v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14719v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14719v1-abstract-full" style="display: none;"> We aim to solve the problem of generating coarse-to-fine skills via learning from demonstrations (LfD). To scale precision, traditional LfD approaches often rely on extensive fine-grained demonstrations with external interpolations or dynamics models with limited generalization capabilities. For memory-efficient learning and convenient granularity change, we propose a novel diffusion-SSM based policy (DiSPo) that learns from diverse coarse skills and produces varying control scales of actions by leveraging a state-space model, Mamba. Our evaluations show that the adoption of Mamba and the proposed step-scaling method enables DiSPo to outperform baselines in five coarse-to-fine benchmark tests, while showing decent performance in typical fine-grained motion learning and reproduction. Finally, we demonstrate the scalability of actions with simulation and real-world manipulation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14719v1-abstract-full').style.display = 'none'; document.getElementById('2409.14719v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09330">arXiv:2409.09330</a> <span> [<a href="https://arxiv.org/pdf/2409.09330">pdf</a>, <a href="https://arxiv.org/format/2409.09330">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCCN.2024.3435909">10.1109/TCCN.2024.3435909 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> VOMTC: Vision Objects for Millimeter and Terahertz Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sunwoo Kim</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+Y">Yongjun Ahn</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daeyoung Park</a>, <a href="/search/cs?searchtype=author&query=Shim%2C+B">Byonghyo Shim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09330v1-abstract-short" style="display: inline;"> Recent advances in sensing and computer vision (CV) technologies have opened the door for the application of deep learning (DL)-based CV technologies in the realm of 6G wireless communications. For the successful application of this emerging technology, it is crucial to have a qualified vision dataset tailored for wireless applications (e.g., RGB images containing wireless devices such as laptops… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09330v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09330v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09330v1-abstract-full" style="display: none;"> Recent advances in sensing and computer vision (CV) technologies have opened the door for the application of deep learning (DL)-based CV technologies in the realm of 6G wireless communications. For the successful application of this emerging technology, it is crucial to have a qualified vision dataset tailored for wireless applications (e.g., RGB images containing wireless devices such as laptops and cell phones). An aim of this paper is to propose a large-scale vision dataset referred to as Vision Objects for Millimeter and Terahertz Communications (VOMTC). The VOMTC dataset consists of 20,232 pairs of RGB and depth images obtained from a camera attached to the base station (BS), with each pair labeled with three representative object categories (person, cell phone, and laptop) and bounding boxes of the objects. Through experimental studies of the VOMTC datasets, we show that the beamforming technique exploiting the VOMTC-trained object detector outperforms conventional beamforming techniques. 
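<p class="is-size-7">A rough sketch of how a vision-aided beam selection step like the one evaluated above can look (illustrative only, not the paper's pipeline; the camera field of view and codebook granularity are assumed values): once a VOMTC-trained detector localizes a device, the horizontal offset of its bounding-box center maps to an angle estimate, which selects a beam from a codebook.</p>
<pre><code>
# Bounding-box center -> angle estimate -> nearest codebook beam (sketch).
import numpy as np

def select_beam(box_cx, img_w, fov_deg=90.0, n_beams=64):
    angle = (box_cx / img_w - 0.5) * fov_deg          # pixel offset -> degrees
    beams = np.linspace(-fov_deg / 2, fov_deg / 2, n_beams)
    return int(np.argmin(np.abs(beams - angle)))      # nearest beam index

print(select_beam(box_cx=1600, img_w=1920))           # device right of center
</code></pre>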
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09330v1-abstract-full').style.display = 'none'; document.getElementById('2409.09330v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Cognitive Communications and Networking, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06993">arXiv:2409.06993</a> <span> [<a href="https://arxiv.org/pdf/2409.06993">pdf</a>, <a href="https://arxiv.org/format/2409.06993">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RICAU-Net: Residual-block Inspired Coordinate Attention U-Net for Segmentation of Small and Sparse Calcium Lesions in Cardiac CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Doyoung Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinsoo Kim</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Q">Qi Chang</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+S">Shuang Leng</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+L">Liang Zhong</a>, <a href="/search/cs?searchtype=author&query=Baskaran%2C+L">Lohendran Baskaran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06993v3-abstract-short" style="display: inline;"> The Agatston score, which is the sum of the calcification in the four main coronary arteries, has been widely used in the diagnosis of coronary artery disease (CAD). However, many studies have emphasized the importance of the vessel-specific Agatston score, as calcification in a specific vessel is significantly correlated with the occurrence of coronary heart disease (CHD). In this paper, we propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06993v3-abstract-full').style.display = 'inline'; document.getElementById('2409.06993v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06993v3-abstract-full" style="display: none;"> The Agatston score, which is the sum of the calcification in the four main coronary arteries, has been widely used in the diagnosis of coronary artery disease (CAD). However, many studies have emphasized the importance of the vessel-specific Agatston score, as calcification in a specific vessel is significantly correlated with the occurrence of coronary heart disease (CHD). In this paper, we propose the Residual-block Inspired Coordinate Attention U-Net (RICAU-Net), which incorporates coordinate attention in two distinct manners and a customized combo loss function for lesion-specific coronary artery calcium (CAC) segmentation. 
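<p class="is-size-7">As a generic illustration of the combo-loss idea just mentioned (the paper's customized version may weight or compose the terms differently): soft Dice, which is insensitive to background dominance, is blended with cross-entropy to handle small, sparse foreground lesions.</p>
<pre><code>
# Combo loss sketch for imbalanced segmentation: alpha * CE + (1 - alpha) * Dice.
import numpy as np

def combo_loss(prob, target, alpha=0.5, eps=1e-7):
    """prob, target: flattened foreground probabilities / binary masks."""
    inter = (prob * target).sum()
    dice = 1 - (2 * inter + eps) / (prob.sum() + target.sum() + eps)
    ce = -np.mean(target * np.log(prob + eps) + (1 - target) * np.log(1 - prob + eps))
    return alpha * ce + (1 - alpha) * dice
</code></pre>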
This approach aims to tackle the high class-imbalance issue associated with small and sparse CAC lesions. Experimental results and the ablation study demonstrate that the proposed method outperforms the five other U-Net based methods used in medical applications by achieving the highest per-lesion Dice scores across all four lesions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06993v3-abstract-full').style.display = 'none'; document.getElementById('2409.06993v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE ISBI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05902">arXiv:2409.05902</a> <span> [<a href="https://arxiv.org/pdf/2409.05902">pdf</a>, <a href="https://arxiv.org/ps/2409.05902">ps</a>, <a href="https://arxiv.org/format/2409.05902">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OPAL: Outlier-Preserved Microscaling Quantization Accelerator for Generative Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Koo%2C+J">Jahyun Koo</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dahoon Park</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+S">Sangwoo Jung</a>, <a href="/search/cs?searchtype=author&query=Kung%2C+J">Jaeha Kung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05902v3-abstract-short" style="display: inline;"> To overcome the burden on the memory size and bandwidth due to the ever-increasing size of large language models (LLMs), aggressive weight quantization has been recently studied, while research on quantizing activations has been lacking. In this paper, we present a hardware-software co-design method that results in an energy-efficient LLM accelerator, named OPAL, for generation tasks. First of all, a novel act… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05902v3-abstract-full').style.display = 'inline'; document.getElementById('2409.05902v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05902v3-abstract-full" style="display: none;"> To overcome the burden on the memory size and bandwidth due to the ever-increasing size of large language models (LLMs), aggressive weight quantization has been recently studied, while research on quantizing activations has been lacking. 
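<p class="is-size-7">A minimal sketch of the outlier-preserved microscaling idea detailed just below (illustrative only, not the OPAL hardware path; the block size of 128 and four preserved outliers follow the abstract, everything else is an assumption): within each block, the few largest-magnitude values stay in floating point while the rest share one power-of-two scale with low-bit integers.</p>
<pre><code>
# Outlier-preserved microscaling quantization of one 128-element block (sketch).
import numpy as np

def opal_quantize_block(x, bits=3, n_outliers=4):
    out_idx = np.argsort(np.abs(x))[-n_outliers:]      # outliers kept in FP
    mask = np.ones_like(x, dtype=bool)
    mask[out_idx] = False
    qmax = 2 ** (bits - 1) - 1
    scale = 2.0 ** np.ceil(np.log2(np.abs(x[mask]).max() / qmax + 1e-12))
    q = np.clip(np.round(x[mask] / scale), -qmax - 1, qmax)
    deq = x.copy()
    deq[mask] = q * scale                              # dequantized block
    return deq

x = np.random.randn(128); x[7] = 25.0                  # one large outlier
print(np.abs(opal_quantize_block(x) - x).max())        # outlier survives exactly
</code></pre>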
arXiv:2409.05902 [cs.LG, cs.AR, cs.CL]
OPAL: Outlier-Preserved Microscaling Quantization Accelerator for Generative Large Language Models
Authors: Jahyun Koo, Dahoon Park, Sangwoo Jung, Jaeha Kung
Abstract: To overcome the burden on memory size and bandwidth due to the ever-increasing size of large language models (LLMs), aggressive weight quantization has recently been studied, while research on quantizing activations is still lacking. In this paper, we present a hardware-software co-design method that results in an energy-efficient LLM accelerator, named OPAL, for generation tasks. First, we propose a novel activation quantization method that leverages the microscaling data format while preserving several outliers per sub-tensor block (e.g., four out of 128 elements). Second, on top of preserving outliers, we employ mixed precision, using 5 bits for inputs to sensitive layers in the decoder block of an LLM while keeping inputs to less sensitive layers at 3 bits. Finally, we present the OPAL hardware architecture, which consists of FP units for handling outliers and vectorized INT multipliers for the dominant non-outlier operations. In addition, OPAL applies a log2-based approximation to softmax that requires only shifts and subtractions, maximizing power efficiency. As a result, we improve energy efficiency by 1.6~2.2x and reduce area by 2.4~3.1x with negligible accuracy loss, i.e., <1 perplexity increase.
Submitted 24 September, 2024; v1 submitted 5 September, 2024; originally announced September 2024.
Comments: 7 pages, 8 figures, DAC2024 accepted
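The outlier-preserving block quantization described above can be sketched in a few lines of NumPy; this is a toy illustration under assumed values (128-element blocks, 4 preserved outliers, one shared power-of-two scale per block) and does not reproduce the paper's hardware datapath or exact rounding rules:

```python
import numpy as np

def quantize_block(x, bits=3, n_outliers=4):
    """Toy per-block quantizer: the few largest-magnitude activations are
    kept in floating point, and the rest are rounded to a low-bit integer
    grid under a shared power-of-two (microscaling-style) scale."""
    x = np.asarray(x, dtype=np.float32)
    outlier_idx = np.argsort(np.abs(x))[-n_outliers:]
    mask = np.zeros_like(x, dtype=bool)
    mask[outlier_idx] = True
    inliers = np.where(mask, 0.0, x)
    qmax = 2 ** (bits - 1) - 1
    amax = max(float(np.abs(inliers).max()), 1e-8)
    scale = 2.0 ** np.ceil(np.log2(amax / qmax))  # shared power-of-two scale
    q = np.clip(np.round(inliers / scale), -qmax - 1, qmax)
    dequant = q * scale
    dequant[mask] = x[mask]  # outliers pass through in floating point
    return dequant

block = np.random.randn(128).astype(np.float32)
print(np.abs(block - quantize_block(block)).max())  # worst-case block error
```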
inline;"> Ventilated acoustic resonator(VAR), a type of acoustic metamaterial, emerge as an alternative for sound attenuation in environments that require ventilation, owing to its excellent low-frequency attenuation performance and flexible shape adaptability. However, due to the non-linear acoustic responses of VARs, the VAR designs are generally obtained within a limited parametrized design space, and th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05917v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05917v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05917v1-abstract-full" style="display: none;"> Ventilated acoustic resonator(VAR), a type of acoustic metamaterial, emerge as an alternative for sound attenuation in environments that require ventilation, owing to its excellent low-frequency attenuation performance and flexible shape adaptability. However, due to the non-linear acoustic responses of VARs, the VAR designs are generally obtained within a limited parametrized design space, and the design relies on the iteration of the numerical simulation which consumes a considerable amount of computational time and resources. This paper proposes an acoustic response-encoded variational autoencoder (AR-VAE), a novel variational autoencoder-based generative design model for the efficient and accurate inverse design of VAR even with non-parametrized designs. The AR-VAE matches the high-dimensional acoustic response with the VAR cross-section image in the dimension-reduced latent space, which enables the AR-VAE to generate various non-parametrized VAR cross-section images with the target acoustic response. AR-VAE generates non-parameterized VARs from target acoustic responses, which show a 25-fold reduction in mean squared error compared to conventional deep learning-based parameter searching methods while exhibiting lower average mean squared error and peak frequency variance. By combining the inverse-designed VARs by AR-VAE, multi-cavity VAR was devised for broadband and multitarget peak frequency attenuation. The proposed design method presents a new approach for structural inverse-design with a high-dimensional non-linear physical response. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05917v1-abstract-full').style.display = 'none'; document.getElementById('2408.05917v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01099">arXiv:2408.01099</a> <span> [<a href="https://arxiv.org/pdf/2408.01099">pdf</a>, <a href="https://arxiv.org/format/2408.01099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Contribution-based Low-Rank Adaptation with Pre-training Model for Real Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Donwon Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hayeon Kim</a>, <a href="/search/cs?searchtype=author&query=Chun%2C+S+Y">Se Young Chun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01099v1-abstract-short" style="display: inline;"> Recently, pre-trained model and efficient parameter tuning have achieved remarkable success in natural language processing and high-level computer vision with the aid of masked modeling and prompt tuning. In low-level computer vision, however, there have been limited investigations on pre-trained models and even efficient fine-tuning strategy has not yet been explored despite its importance and be… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01099v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01099v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01099v1-abstract-full" style="display: none;"> Recently, pre-trained model and efficient parameter tuning have achieved remarkable success in natural language processing and high-level computer vision with the aid of masked modeling and prompt tuning. In low-level computer vision, however, there have been limited investigations on pre-trained models and even efficient fine-tuning strategy has not yet been explored despite its importance and benefit in various real-world tasks such as alleviating memory inflation issue when integrating new tasks on AI edge devices. Here, we propose a novel efficient parameter tuning approach dubbed contribution-based low-rank adaptation (CoLoRA) for multiple image restorations along with effective pre-training method with random order degradations (PROD). Unlike prior arts that tune all network parameters, our CoLoRA effectively fine-tunes small amount of parameters by leveraging LoRA (low-rank adaptation) for each new vision task with our contribution-based method to adaptively determine layer by layer capacity for that task to yield comparable performance to full tuning. Furthermore, our PROD strategy allows to extend the capability of pre-trained models with improved performance as well as robustness to bridge synthetic pre-training and real-world fine-tuning. Our CoLoRA with PROD has demonstrated its superior performance in various image restoration tasks across diverse degradation types on both synthetic and real-world datasets for known and novel tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01099v1-abstract-full').style.display = 'none'; document.getElementById('2408.01099v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 15 figures, for homepage see this url : https://janeyeon.github.io/colora/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16125">arXiv:2407.16125</a> <span> [<a href="https://arxiv.org/pdf/2407.16125">pdf</a>, <a href="https://arxiv.org/format/2407.16125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Prior-Based Amortized Variational Inference for Noisy Inverse Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sojin Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dogyun Park</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+I">Inho Kong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16125v1-abstract-short" style="display: inline;"> Recent studies on inverse problems have proposed posterior samplers that leverage the pre-trained diffusion models as powerful priors. These attempts have paved the way for using diffusion models in a wide range of inverse problems. However, the existing methods entail computationally demanding iterative sampling procedures and optimize a separate solution for each measurement, which leads to limi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16125v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16125v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16125v1-abstract-full" style="display: none;"> Recent studies on inverse problems have proposed posterior samplers that leverage the pre-trained diffusion models as powerful priors. These attempts have paved the way for using diffusion models in a wide range of inverse problems. However, the existing methods entail computationally demanding iterative sampling procedures and optimize a separate solution for each measurement, which leads to limited scalability and lack of generalization capability across unseen samples. To address these limitations, we propose a novel approach, Diffusion prior-based Amortized Variational Inference (DAVI) that solves inverse problems with a diffusion prior from an amortized variational inference perspective. 
arXiv:2407.16125 [cs.CV]
Diffusion Prior-Based Amortized Variational Inference for Noisy Inverse Problems
Authors: Sojin Lee, Dogyun Park, Inho Kong, Hyunwoo J. Kim
Abstract: Recent studies on inverse problems have proposed posterior samplers that leverage pre-trained diffusion models as powerful priors. These attempts have paved the way for using diffusion models in a wide range of inverse problems. However, existing methods entail computationally demanding iterative sampling procedures and optimize a separate solution for each measurement, which leads to limited scalability and a lack of generalization across unseen samples. To address these limitations, we propose a novel approach, Diffusion prior-based Amortized Variational Inference (DAVI), which solves inverse problems with a diffusion prior from an amortized variational inference perspective. Specifically, instead of separate measurement-wise optimization, our amortized inference learns a function that directly maps measurements to the implicit posterior distributions of the corresponding clean data, enabling single-step posterior sampling even for unseen measurements. Extensive experiments on image restoration tasks, e.g., Gaussian deblurring, 4x super-resolution, and box inpainting on two benchmark datasets, demonstrate our approach's superior performance over strong baselines. Code is available at https://github.com/mlvlab/DAVI.
Submitted 22 July, 2024; originally announced July 2024.
Comments: ECCV 2024; 41 pages, 19 figures
arXiv:2406.16896 [eess.SP, cs.LG]
f-GAN: A frequency-domain-constrained generative adversarial network for PPG to ECG synthesis
Authors: Nathan C. L. Kong, Dae Lee, Huyen Do, Dae Hoon Park, Cong Xu, Hongda Mao, Jonathan Chung
Abstract: Electrocardiograms (ECGs) and photoplethysmograms (PPGs) are generally used to monitor an individual's cardiovascular health. In clinical settings, ECGs and fingertip PPGs are the main signals used for assessing cardiovascular health, but the equipment necessary for their collection precludes their use in daily monitoring. Although PPGs obtained from wrist-worn devices are susceptible to motion noise, they have been widely used to continuously monitor cardiovascular health because of their convenience. We therefore aim to combine the ease with which PPGs can be collected with the information that ECGs provide about cardiovascular health, by developing models that synthesize ECG signals from paired PPG signals. We tackled this problem using generative adversarial networks (GANs) and found that models trained with the original GAN formulation can successfully synthesize ECG signals from which heart rate can be extracted using standard signal processing pipelines. Incorporating a frequency-domain constraint into model training improved both the stability of model performance and the performance on heart rate estimation.
Submitted 15 May, 2024; originally announced June 2024.
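A frequency-domain constraint of the kind described can take the form of a spectral-magnitude penalty added to the adversarial objective. The sketch below is one plausible formulation, not necessarily the paper's exact loss:

```python
import torch

def frequency_domain_loss(fake_ecg: torch.Tensor, real_ecg: torch.Tensor) -> torch.Tensor:
    """Penalize the distance between magnitude spectra of synthesized and
    reference ECG signals (shape: batch x samples). One possible form of a
    frequency-domain constraint; the paper's formulation may differ."""
    fake_mag = torch.abs(torch.fft.rfft(fake_ecg, dim=-1))
    real_mag = torch.abs(torch.fft.rfft(real_ecg, dim=-1))
    return torch.mean((fake_mag - real_mag) ** 2)
```

In training, the generator objective would then be the usual adversarial term plus a weighted version of this penalty, which discourages spectrally implausible ECG waveforms even when they fool the discriminator in the time domain.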
arXiv:2406.07736 [cs.CL]
MultiPragEval: Multilingual Pragmatic Evaluation of Large Language Models
Authors: Dojun Park, Jiwoo Lee, Seohyun Park, Hyeyun Jeong, Youngeun Koo, Soonha Hwang, Seonwoo Park, Sungeun Lee
Abstract: As the capabilities of Large Language Models (LLMs) expand, it becomes increasingly important to evaluate them beyond basic knowledge assessment, focusing on higher-level language understanding. This study introduces MultiPragEval, the first multilingual pragmatic evaluation of LLMs, designed for English, German, Korean, and Chinese. Comprising 1200 question units categorized according to Grice's Cooperative Principle and its four conversational maxims, MultiPragEval enables an in-depth assessment of LLMs' contextual awareness and their ability to infer implied meanings. Our findings demonstrate that Claude3-Opus significantly outperforms other models in all tested languages, establishing the state of the art in the field. Among open-source models, Solar-10.7B and Qwen1.5-14B emerge as strong competitors. By analyzing pragmatic inference, we provide valuable insights into the capabilities essential for advanced language comprehension in AI systems.
Submitted 30 September, 2024; v1 submitted 11 June, 2024; originally announced June 2024.
Comments: The 2nd GenBench workshop on generalisation (benchmarking) in NLP

arXiv:2405.01554 [cs.LG, cs.AI, q-bio.NC]
Early-stage detection of cognitive impairment by hybrid quantum-classical algorithm using resting-state functional MRI time-series
Authors: Junggu Choi, Tak Hur, Daniel K. Park, Na-Young Shin, Seung-Koo Lee, Hakbae Lee, Sanghoon Han
Abstract: Following the recent development of quantum machine learning techniques, the literature has reported several quantum machine learning algorithms for disease detection. This study explores the application of a hybrid quantum-classical algorithm for classifying region-of-interest time-series data obtained from resting-state functional magnetic resonance imaging in patients with early-stage cognitive impairment, motivated by the importance of cognitive decline in dementia and aging. Classical one-dimensional convolutional layers are used together with quantum convolutional neural networks in our hybrid algorithm. In classical simulation, the proposed hybrid algorithms showed higher balanced accuracies than classical convolutional neural networks under similar training conditions. Moreover, nine of 116 brain regions (left precentral gyrus, right superior temporal gyrus, left rolandic operculum, right rolandic operculum, left parahippocampus, right hippocampus, left medial frontal gyrus, right cerebellum crus, and cerebellar vermis) were found to be relatively effective for the classification, based on model performance. The associations of the selected nine regions with cognitive decline, as found in previous studies, were additionally validated through seed-based functional connectivity analysis. We confirmed both the improvement in model performance from the quantum convolutional neural network and the neuroscientific validity of the selected brain regions.
Submitted 16 March, 2024; originally announced May 2024.
Comments: 28 pages, 10 figures

arXiv:2405.00523 [cs.AI, cs.CL]
CookingSense: A Culinary Knowledgebase with Multidisciplinary Assertions
Authors: Donghee Choi, Mogan Gim, Donghyeon Park, Mujeen Sung, Hyunjae Kim, Jaewoo Kang, Jihun Choi
Abstract: This paper introduces CookingSense, a descriptive collection of knowledge assertions in the culinary domain, extracted from various sources including web data, scientific papers, and recipes, from which knowledge covering a broad range of aspects is acquired. CookingSense is constructed through a series of dictionary-based filtering and language-model-based semantic filtering techniques, resulting in a rich knowledgebase of multidisciplinary food-related assertions. Additionally, we present FoodBench, a novel benchmark for evaluating culinary decision support systems. In evaluations with FoodBench, we empirically show that CookingSense improves the performance of retrieval-augmented language models. We also validate the quality and variety of assertions in CookingSense through qualitative analysis.
Submitted 1 May, 2024; originally announced May 2024.
Comments: LREC-COLING 2024 Accepted
Report number: https://aclanthology.org/2024.lrec-main.354
Journal ref: LREC-COLING 2024
arXiv:2404.09228 [cs.RO]
DOI: 10.1007/s11370-024-00550-5
A Survey on Integration of Large Language Models with Intelligent Robots
Authors: Yeseung Kim, Dohyun Kim, Jieun Choi, Jisang Park, Nayoung Oh, Daehyung Park
Abstract: In recent years, the integration of large language models (LLMs) has revolutionized the field of robotics, enabling robots to communicate, understand, and reason with human-like proficiency. This paper explores the multifaceted impact of LLMs on robotics, addressing key challenges and opportunities for leveraging these models across various domains. By categorizing and analyzing LLM applications within core robotics elements -- communication, perception, planning, and control -- we aim to provide actionable insights for researchers seeking to integrate LLMs into their robotic systems. Our investigation focuses on LLMs developed post-GPT-3.5, primarily in text-based modalities, while also considering multimodal approaches for perception and control. We offer comprehensive guidelines and examples for prompt engineering, facilitating beginners' access to LLM-based robotics solutions. Through tutorial-level examples and structured prompt construction, we illustrate how LLM-guided enhancements can be integrated into robotics applications. This survey serves as a roadmap for researchers navigating the evolving landscape of LLM-driven robotics, offering a comprehensive overview and practical guidance for harnessing language models in robotics development.
Submitted 15 August, 2024; v1 submitted 14 April, 2024; originally announced April 2024.
Comments: 24 pages, 5 figures, published in Intelligent Service Robotics (ISR)
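Structured prompt construction of the kind the survey discusses typically separates role, observed state, task, and output format. The following is a hypothetical example of the pattern, not one of the survey's own templates:

```python
# Illustrative structured prompt for an LLM-based task planner; the action
# vocabulary and phrasing here are assumptions for demonstration only.
def build_planner_prompt(task: str, objects: list[str]) -> str:
    return "\n".join([
        "You are a robot task planner.",                      # role
        f"Objects visible: {', '.join(objects)}.",            # environment state
        f"Task: {task}",                                      # goal
        "Respond with a numbered list of primitive actions",  # output format
        "chosen from: pick(obj), place(obj, loc), move(loc).",
    ])

print(build_planner_prompt("put the apple in the bin", ["apple", "bin", "cup"]))
```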
arXiv:2404.05218 [cs.CV, cs.AI]
Multi-agent Long-term 3D Human Pose Forecasting via Interaction-aware Trajectory Conditioning
Authors: Jaewoo Jeong, Daehee Park, Kuk-Jin Yoon
Abstract: Human pose forecasting garners attention for its diverse applications. However, challenges in modeling the multi-modal nature of human motion and the intricate interactions among agents persist, particularly at longer timescales and with more agents. In this paper, we propose an interaction-aware, trajectory-conditioned, long-term multi-agent human pose forecasting model that uses a coarse-to-fine prediction approach: multi-modal global trajectories are forecasted first, followed by local pose forecasts conditioned on each mode. Our Trajectory2Pose model introduces a graph-based agent-wise interaction module for reciprocal forecasting of motion-conditioned global trajectory and trajectory-conditioned local pose. The model effectively handles the multi-modality of human motion and the complexity of long-term multi-agent interactions, improving performance in complex environments. Furthermore, we address the lack of long-term (6s+) multi-agent (5+) datasets by constructing a new dataset from real-world images and 2D annotations, enabling a comprehensive evaluation of the proposed model. State-of-the-art prediction performance on both complex and simpler datasets confirms the generalized effectiveness of our method. The code is available at https://github.com/Jaewoo97/T2P.
Submitted 8 April, 2024; originally announced April 2024.
Comments: 2024 CVPR Highlight

arXiv:2404.01954 [cs.CL, cs.AI]
HyperCLOVA X Technical Report
Authors: Kang Min Yoo, Jaegeun Han, Sookyo In, Heewon Jeon, Jisu Jeong, Jaewook Kang, Hyunwook Kim, Kyung-Min Kim, Munhyong Kim, Sungju Kim, Donghyun Kwak, Hanock Kwak, Se Jung Kwon, Bado Lee, Dongsoo Lee, Gichang Lee, Jooho Lee, Baeseong Park, Seongjin Shin, Joonsang Yu, Seolki Baek, Sumin Byeon, Eungsup Cho, Dooseok Choe, Jeesung Han, et al. (371 additional authors not shown)
Abstract: We introduce HyperCLOVA X, a family of large language models (LLMs) tailored to the Korean language and culture, with competitive capabilities in English, math, and coding. HyperCLOVA X was trained on a balanced mix of Korean, English, and code data, followed by instruction tuning with high-quality human-annotated datasets, while abiding by strict safety guidelines that reflect our commitment to responsible AI. The model is evaluated across various benchmarks, including comprehensive reasoning, knowledge, commonsense, factuality, coding, math, chatting, instruction-following, and harmlessness, in both Korean and English. HyperCLOVA X exhibits strong reasoning capabilities in Korean, backed by a deep understanding of the language and cultural nuances. Further analysis of its inherent bilingual nature and its extension to multilingualism highlights the model's cross-lingual proficiency and strong generalization to untargeted languages, including machine translation between several language pairs and cross-lingual inference tasks. We believe that HyperCLOVA X can provide helpful guidance for regions or countries developing their own sovereign LLMs.
Submitted 13 April, 2024; v1 submitted 2 April, 2024; originally announced April 2024.
Comments: 44 pages; updated author list and fixed author names
arXiv:2403.19099 [quant-ph, cs.LG]
Optimizing Quantum Convolutional Neural Network Architectures for Arbitrary Data Dimension
Authors: Changwon Lee, Israel F. Araujo, Dongha Kim, Junghan Lee, Siheon Park, Ju-Young Ryu, Daniel K. Park
Abstract: Quantum convolutional neural networks (QCNNs) represent a promising approach in quantum machine learning, paving new directions for both quantum and classical data analysis. The approach is particularly attractive due to its feasibility and the absence of the barren plateau problem, a fundamental challenge in training quantum neural networks (QNNs). However, a limitation arises when applying QCNNs to classical data. The network architecture is most natural when the number of input qubits is a power of two, because that number is halved in each pooling layer. The number of input qubits determines the dimension (i.e., the number of features) of the input data that can be processed, restricting the applicability of QCNN algorithms to real-world data. To address this issue, we propose a QCNN architecture capable of handling arbitrary input data dimensions while optimizing the allocation of quantum resources such as ancillary qubits and quantum gates. This optimization is not only important for minimizing computational resources, but is also essential in noisy intermediate-scale quantum (NISQ) computing, where the size of the quantum circuits that can be executed reliably is limited. Through numerical simulations, we benchmarked the classification performance of various QCNN architectures handling arbitrary input data dimensions on the MNIST and Breast Cancer datasets. The results validate that the proposed QCNN architecture achieves excellent classification performance with minimal resource overhead, providing an optimal solution when reliable quantum computation is constrained by noise and imperfections.
Submitted 27 March, 2024; originally announced March 2024.
Comments: 17 pages, 7 figures
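The power-of-two restriction described in the abstract comes from each pooling layer halving the active qubit count. A quick calculation (illustrative arithmetic only, not the paper's resource-allocation scheme) shows how few clean pooling layers a non-power-of-two input width admits:

```python
# Each QCNN pooling layer halves the number of active qubits, so the
# architecture is most natural when the qubit count is a power of two.
def pooling_depth(n_qubits: int) -> int:
    """Count how many times the qubit register halves evenly."""
    depth = 0
    while n_qubits > 1 and n_qubits % 2 == 0:
        n_qubits //= 2
        depth += 1
    return depth

for n in (8, 10, 12, 16):
    print(n, "qubits ->", pooling_depth(n), "clean pooling layers")
# 8 and 16 qubits pool all the way down; 10 and 12 stall after 1-2 layers,
# which is the mismatch the proposed architecture is designed to handle.
```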
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.14191">arXiv:2403.14191</a> <span> [<a href="https://arxiv.org/pdf/2403.14191">pdf</a>, <a href="https://arxiv.org/format/2403.14191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.compbiomed.2024.108241">10.1016/j.compbiomed.2024.108241 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> PECI-Net: Bolus segmentation from video fluoroscopic swallowing study images using preprocessing ensemble and cascaded inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dougho Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Younghun Kim</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+H">Harim Kang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Junmyeoung Lee</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+J">Jinyoung Choi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+T">Taeyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sangeok Lee</a>, <a href="/search/cs?searchtype=author&query=Son%2C+S">Seokil Son</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minsol Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+I">Injung Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.14191v1-abstract-short" style="display: inline;"> Bolus segmentation is crucial for the automated detection of swallowing disorders in videofluoroscopic swallowing studies (VFSS). However, it is difficult for the model to accurately segment a bolus region in a VFSS image because VFSS images are translucent, have low contrast and unclear region boundaries, and lack color information. To overcome these challenges, we propose PECI-Net, a network arc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14191v1-abstract-full').style.display = 'inline'; document.getElementById('2403.14191v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.14191v1-abstract-full" style="display: none;"> Bolus segmentation is crucial for the automated detection of swallowing disorders in videofluoroscopic swallowing studies (VFSS). However, it is difficult for the model to accurately segment a bolus region in a VFSS image because VFSS images are translucent, have low contrast and unclear region boundaries, and lack color information. To overcome these challenges, we propose PECI-Net, a network architecture for VFSS image analysis that combines two novel techniques: the preprocessing ensemble network (PEN) and the cascaded inference network (CIN). 
PEN enhances the sharpness and contrast of the VFSS image by combining multiple preprocessing algorithms in a learnable way. CIN reduces ambiguity in bolus segmentation by using context from other regions through cascaded inference. Moreover, CIN prevents undesirable side effects from unreliably segmented regions by referring to the context in an asymmetric way. In experiments, PECI-Net exhibited higher performance than four recently developed baseline models, outperforming TernausNet, the best among the baseline models, by 4.54\% and the widely used UNet by 10.83\%. The results of the ablation studies confirm that CIN and PEN are effective in improving bolus segmentation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.14191v1-abstract-full').style.display = 'none'; document.getElementById('2403.14191v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 8 figures,</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Computers in Biology and Medicine (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12675">arXiv:2403.12675</a> <span> [<a href="https://arxiv.org/pdf/2403.12675">pdf</a>, <a href="https://arxiv.org/format/2403.12675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Pragmatic Competence Evaluation of Large Language Models for the Korean Language </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dojun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jiwoo Lee</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+H">Hyeyun Jeong</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Seohyun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sungeun Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12675v2-abstract-short" style="display: inline;"> Benchmarks play a significant role in the current evaluation of Large Language Models (LLMs), yet they often overlook the models' abilities to capture the nuances of human language, primarily focusing on evaluating embedded knowledge and technical skills. 
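The learnable combination performed by PEN can be pictured as a softmax-weighted mixture over the outputs of a fixed set of preprocessing algorithms. This is a sketch only; the choice of preprocessing set and the real PEN architecture are assumptions, not the paper's design:

```python
import torch
import torch.nn as nn

class PreprocessingEnsemble(nn.Module):
    """Sketch of a learnable mixture over preprocessed image variants.
    The real PEN is a network; this reduces it to one softmax weight
    per preprocessing algorithm for illustration."""
    def __init__(self, n_preprocessors: int):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(n_preprocessors))

    def forward(self, variants: torch.Tensor) -> torch.Tensor:
        # variants: (P, B, C, H, W), one tensor per preprocessing algorithm
        # (e.g., identity, contrast enhancement, sharpening; assumed set).
        w = torch.softmax(self.logits, dim=0)
        return torch.einsum("p,pbchw->bchw", w, variants)

pen = PreprocessingEnsemble(n_preprocessors=3)
print(pen(torch.randn(3, 2, 1, 64, 64)).shape)  # (2, 1, 64, 64)
```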
arXiv:2403.12675 [cs.CL]
Pragmatic Competence Evaluation of Large Language Models for the Korean Language
Authors: Dojun Park, Jiwoo Lee, Hyeyun Jeong, Seohyun Park, Sungeun Lee
Abstract: Benchmarks play a significant role in the current evaluation of Large Language Models (LLMs), yet they often overlook the models' abilities to capture the nuances of human language, focusing primarily on embedded knowledge and technical skills. To address this gap, our study evaluates how well LLMs understand context-dependent expressions from a pragmatic standpoint, specifically in Korean. We use both Multiple-Choice Questions (MCQs) for automatic evaluation and Open-Ended Questions (OEQs) assessed by human experts. Our results show that GPT-4 leads with scores of 81.11 on MCQs and 85.69 on OEQs, closely followed by HyperCLOVA X. Additionally, while few-shot learning generally improves performance, Chain-of-Thought (CoT) prompting tends to encourage literal interpretations, which may limit effective pragmatic inference. Our findings highlight the need for LLMs to better understand and generate language that reflects human communicative norms.
Submitted 17 October, 2024; v1 submitted 19 March, 2024; originally announced March 2024.
Comments: 38th Pacific Asia Conference on Language, Information and Computation
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12666">arXiv:2403.12666</a> <span> [<a href="https://arxiv.org/pdf/2403.12666">pdf</a>, <a href="https://arxiv.org/format/2403.12666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Dimensional Machine Translation Evaluation: Model Evaluation and Resource for Korean </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dojun Park</a>, <a href="/search/cs?searchtype=author&query=Pad%C3%B3%2C+S">Sebastian Padó</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2403.12666v1-abstract-full" style="display: inline;"> Almost all frameworks for the manual or automatic evaluation of machine translation characterize the quality of an MT output with a single number. An exception is the Multidimensional Quality Metrics (MQM) framework, which offers a fine-grained ontology of quality dimensions for scoring (such as style, fluency, accuracy, and terminology). Previous studies have demonstrated the feasibility of MQM annotation, but there are, to our knowledge, no computational models that predict MQM scores for novel texts, due to a lack of resources. In this paper, we address these shortcomings by (a) providing a 1200-sentence MQM evaluation benchmark for the language pair English-Korean and (b) reframing MT evaluation as the multi-task problem of simultaneously predicting several MQM scores using SOTA language models, both in a reference-based MT evaluation setup and a reference-free quality estimation (QE) setup. We find that the reference-free setup outperforms its counterpart in the style dimension while reference-based models retain an edge regarding accuracy. Overall, RemBERT emerges as the most promising model. Through our evaluation, we offer insight into translation quality in a more fine-grained, interpretable manner. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, accepted at LREC-COLING 2024</span> </p> </li>
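<li> <p class="is-size-7">The multi-task framing (one score per MQM dimension) can be sketched as a shared encoder with one regression head per dimension. This is a minimal sketch under assumptions: the encoder is any HuggingFace-style model returning <code>last_hidden_state</code>, and the dimension names simply follow the abstract; it is not the paper's exact model.</p> <pre><code class="language-python">
# Minimal multi-task MQM regression sketch (illustrative, not the paper's code).
import torch
import torch.nn as nn

class MultiTaskMQM(nn.Module):
    def __init__(self, encoder, hidden=768,
                 dims=("style", "fluency", "accuracy", "terminology")):
        super().__init__()
        self.encoder = encoder   # assumed HuggingFace-style encoder
        self.heads = nn.ModuleDict({d: nn.Linear(hidden, 1) for d in dims})

    def forward(self, input_ids, attention_mask):
        states = self.encoder(
            input_ids, attention_mask=attention_mask
        ).last_hidden_state
        pooled = states[:, 0]    # first-token pooling
        return {d: head(pooled).squeeze(-1) for d, head in self.heads.items()}
</code></pre> <p class="is-size-7">Training would then sum a per-dimension regression loss (e.g., MSE) over the heads, so all MQM scores are predicted in one forward pass.</p> </li>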
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, accepted at LREC-COLING 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12054">arXiv:2403.12054</a> <span> [<a href="https://arxiv.org/pdf/2403.12054">pdf</a>, <a href="https://arxiv.org/format/2403.12054">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Haze Removal via Regional Saturation-Value Translation and Soft Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tran%2C+L">Le-Anh Tran</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dong-Chul Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12054v1-abstract-short" style="display: inline;"> This paper proposes a single image dehazing prior, called Regional Saturation-Value Translation (RSVT), to tackle the color distortion problems caused by conventional dehazing approaches in bright regions. The RSVT prior is developed based on two key observations regarding the relationship between hazy and haze-free points in the HSV color space. First, the hue component shows marginal variation b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12054v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12054v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12054v1-abstract-full" style="display: none;"> This paper proposes a single image dehazing prior, called Regional Saturation-Value Translation (RSVT), to tackle the color distortion problems caused by conventional dehazing approaches in bright regions. The RSVT prior is developed based on two key observations regarding the relationship between hazy and haze-free points in the HSV color space. First, the hue component shows marginal variation between corresponding hazy and haze-free points, consolidating a hypothesis that the pixel value variability induced by haze primarily occurs in the saturation and value spaces. Second, in the 2D saturation-value coordinate system, most lines passing through hazy-clean point pairs are likely to intersect near the atmospheric light coordinates. Accordingly, haze removal for the bright regions can be performed by properly translating saturation-value coordinates. In addition, an effective soft segmentation method based on a morphological min-max channel is introduced. By combining the soft segmentation mask with the RSVT prior, a comprehensive single image dehazing framework is devised. Experimental results on various synthetic and realistic hazy image datasets demonstrate that the proposed scheme successfully addresses color distortion issues and restores visually appealing images. The code of this work is available at https://github.com/tranleanh/rsvt. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12054v1-abstract-full').style.display = 'none'; document.getElementById('2403.12054v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12049">arXiv:2403.12049</a> <span> [<a href="https://arxiv.org/pdf/2403.12049">pdf</a>, <a href="https://arxiv.org/format/2403.12049">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Toward Improving Robustness of Object Detectors Against Domain Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tran%2C+L">Le-Anh Tran</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+C+N">Chung Nguyen Tran</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dong-Chul Park</a>, <a href="/search/cs?searchtype=author&query=Carrabina%2C+J">Jordi Carrabina</a>, <a href="/search/cs?searchtype=author&query=Castells-Rufas%2C+D">David Castells-Rufas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12049v1-abstract-short" style="display: inline;"> This paper proposes a data augmentation method for improving the robustness of driving object detectors against domain shift. Domain shift problem arises when there is a significant change between the distribution of the source data domain used in the training phase and that of the target data domain in the deployment phase. Domain shift is known as one of the most popular reasons resulting in the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12049v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12049v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12049v1-abstract-full" style="display: none;"> This paper proposes a data augmentation method for improving the robustness of driving object detectors against domain shift. Domain shift problem arises when there is a significant change between the distribution of the source data domain used in the training phase and that of the target data domain in the deployment phase. Domain shift is known as one of the most popular reasons resulting in the considerable drop in the performance of deep neural network models. In order to address this problem, one effective approach is to increase the diversity of training data. To this end, we propose a data synthesis module that can be utilized to train more robust and effective object detectors. By adopting YOLOv4 as a base object detector, we have witnessed a remarkable improvement in performance on both the source and target domain data. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10492">arXiv:2403.10492</a> <span> [<a href="https://arxiv.org/pdf/2403.10492">pdf</a>, <a href="https://arxiv.org/format/2403.10492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Dialogue Hallucination for Large Vision Language Models via Adversarial Instruction Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongmin Park</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Z">Zhaofang Qian</a>, <a href="/search/cs?searchtype=author&query=Han%2C+G">Guangxing Han</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Ser-Nam Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2403.10492v3-abstract-full" style="display: inline;"> Mitigating hallucinations of Large Vision Language Models (LVLMs) is crucial to enhance their reliability as general-purpose assistants. This paper shows that such hallucinations of LVLMs can be significantly exacerbated by preceding user-system dialogues. To measure this precisely, we first present an evaluation benchmark by extending popular multi-modal benchmark datasets with prepended hallucinatory dialogues powered by our novel Adversarial Question Generator (AQG), which can automatically generate image-related yet adversarial dialogues by adopting adversarial attacks on LVLMs. On our benchmark, the zero-shot performance of state-of-the-art LVLMs drops significantly for both the VQA and captioning tasks. Next, we further reveal that this hallucination is mainly due to the prediction bias toward preceding dialogues rather than visual content. To reduce this bias, we propose Adversarial Instruction Tuning (AIT), which robustly fine-tunes LVLMs against hallucinatory dialogues. Extensive experiments show that our proposed approach successfully reduces dialogue hallucination while maintaining performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li>
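<li> <p class="is-size-7">To make the benchmark construction concrete, here is a toy sketch of prepending a hallucinatory dialogue to a VQA query. The dialogue text is made up for the example, and the prompt format is an assumption; it is not the paper's AQG, which generates such dialogues automatically via adversarial attacks.</p> <pre><code class="language-python">
# Toy construction of a dialogue-prepended evaluation prompt
# (illustrative only; not the paper's AQG).
def with_prepended_dialogue(question, dialogue_turns):
    history = "\n".join(
        f"User: {u}\nAssistant: {a}" for u, a in dialogue_turns
    )
    return f"{history}\nUser: {question}\nAssistant:"

adversarial_turns = [
    ("Is there a red truck parked by the curb?",
     "Yes, a red truck is visible near the curb."),  # hallucinatory turn
]
prompt = with_prepended_dialogue(
    "What vehicles are in the image?", adversarial_turns
)
</code></pre> </li>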
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.10052">arXiv:2403.10052</a> <span> [<a href="https://arxiv.org/pdf/2403.10052">pdf</a>, <a href="https://arxiv.org/format/2403.10052">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> T4P: Test-Time Training of Trajectory Prediction via Masked Autoencoder and Actor-specific Token Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehee Park</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaeseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S">Sung-Hoon Yoon</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jaewoo Jeong</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2403.10052v1-abstract-full" style="display: inline;"> Trajectory prediction is a challenging problem that requires considering interactions among multiple actors and the surrounding environment. While data-driven approaches have been used to address this complex problem, they suffer from unreliable predictions under distribution shifts during test time. Accordingly, several online learning methods have been proposed using a regression loss on the ground truth of observed data, leveraging the auto-labeling nature of the trajectory prediction task. We mainly tackle the following two issues. First, previous works underfit and overfit as they only optimize the last layer of the motion decoder. To this end, we employ the masked autoencoder (MAE) for representation learning to encourage complex interaction modeling in the shifted test distribution while updating deeper layers. Second, utilizing the sequential nature of driving data, we propose an actor-specific token memory that enables test-time learning of actor-wise motion characteristics. Our proposed method has been validated across various challenging cross-dataset distribution shift scenarios, including nuScenes, Lyft, Waymo, and Interaction. Our method surpasses the performance of existing state-of-the-art online learning methods in terms of both prediction accuracy and computational efficiency. The code is available at https://github.com/daeheepark/T4P. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2024</span> </p> </li>
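<li> <p class="is-size-7">A single test-time adaptation step in the spirit described above: mask part of the observed trajectory, reconstruct it, and backpropagate so that updates reach deeper layers rather than only the last one. The model interface and masking scheme here are illustrative assumptions, not T4P's released code.</p> <pre><code class="language-python">
# One self-supervised test-time update via masked reconstruction
# (illustrative sketch; model and masking are assumptions).
import torch
import torch.nn as nn

def test_time_step(model, optimizer, observed, mask_ratio=0.3):
    """observed: (B, T, 2) past positions available at test time."""
    mask = torch.rand(observed.shape[0], observed.shape[1]).lt(mask_ratio)
    corrupted = observed.masked_fill(mask.unsqueeze(-1), 0.0)
    recon = model(corrupted)                 # same shape as observed
    loss = nn.functional.mse_loss(recon[mask], observed[mask])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                         # updates all layers, not just the last
    return loss.item()
</code></pre> <p class="is-size-7">Because the reconstruction target is the already-observed past, no future ground truth is needed during deployment.</p> </li>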
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.00013">arXiv:2403.00013</a> <span> [<a href="https://arxiv.org/pdf/2403.00013">pdf</a>, <a href="https://arxiv.org/format/2403.00013">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prioritizing Informative Features and Examples for Deep Learning from Noisy Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongmin Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2403.00013v2-abstract-full" style="display: inline;"> In this dissertation, we propose a systematic framework that prioritizes informative features and examples to enhance each stage of the development process, improving the performance of feature learning, data labeling, and data selection. We first propose an approach to extract only informative features that are inherent to solving a target task by using auxiliary out-of-distribution data; we deactivate the noise features in the target distribution by using those in the out-of-distribution data. Next, we introduce an approach that prioritizes informative examples from unlabeled noisy data in order to reduce the labeling cost of active learning. To resolve the purity-information dilemma, where an attempt to select informative examples induces the selection of many noisy examples, we propose a meta-model that finds the best balance between purity and informativeness. Lastly, we suggest an approach that prioritizes informative examples from labeled noisy data to preserve the performance of data selection. For labeled noisy image data, we propose a data selection method that considers the confidence of neighboring samples to maintain the performance of state-of-the-art re-labeling models. For labeled noisy text data, we present an instruction selection method that takes diversity into account when ranking the quality of instructions with prompting, thereby enhancing the performance of aligned large language models. Overall, our unified framework makes the deep learning development process robust to noisy data, effectively mitigating noisy features and examples in real-world applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">PhD thesis</span> </p> </li>
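<li> <p class="is-size-7">The purity-information dilemma can be made tangible with a toy acquisition score that trades off a cleanliness estimate against predictive entropy. The single <code>balance</code> weight below is a stand-in for the meta-model the thesis proposes, which learns that balance rather than fixing it.</p> <pre><code class="language-python">
# Toy purity-vs-informativeness acquisition score (illustrative only).
import numpy as np

def acquisition_scores(probs, purity, balance=0.5):
    """probs: (N, C) predicted class probabilities; purity: (N,) in [0, 1]."""
    entropy = -(probs * np.log(probs + 1e-12)).sum(axis=1)
    info = entropy / np.log(probs.shape[1])   # normalize to [0, 1]
    return balance * purity + (1.0 - balance) * info

probs = np.random.dirichlet(np.ones(10), size=100)
purity = np.random.rand(100)
ranked = np.argsort(-acquisition_scores(probs, purity))  # query top items first
</code></pre> </li>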
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16506">arXiv:2402.16506</a> <span> [<a href="https://arxiv.org/pdf/2402.16506">pdf</a>, <a href="https://arxiv.org/format/2402.16506">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Conditional Diffusion Models for Robust Semantic Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ko%2C+J">Juyeon Ko</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+I">Inho Kong</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dogyun Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2402.16506v3-abstract-full" style="display: inline;"> Semantic image synthesis (SIS) is a task to generate realistic images corresponding to semantic maps (labels). However, in real-world applications, SIS often encounters noisy user inputs. To address this, we propose the Stochastic Conditional Diffusion Model (SCDM), a robust conditional diffusion model featuring novel forward and generation processes tailored for SIS with noisy labels. It enhances robustness by stochastically perturbing the semantic label maps through Label Diffusion, which diffuses the labels with discrete diffusion. Through the diffusion of labels, the noisy and clean semantic maps become similar as the timestep increases, eventually becoming identical at $t=T$. This facilitates the generation of an image close to a clean image, enabling robust generation. Furthermore, we propose a class-wise noise schedule to differentially diffuse the labels depending on the class. We demonstrate that the proposed method generates high-quality samples through extensive experiments and analyses on benchmark datasets, including a novel experimental setup simulating human errors during real-world applications. Code is available at https://github.com/mlvlab/SCDM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024</span> </p> </li>
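<li> <p class="is-size-7">A minimal form of discrete label diffusion: with a timestep-dependent probability, each pixel's label is resampled uniformly, so noisy and clean maps converge (in distribution) as t approaches T. The sine-squared schedule below is an illustrative choice, not SCDM's class-wise schedule.</p> <pre><code class="language-python">
# Minimal discrete label-diffusion sketch (schedule is illustrative).
import numpy as np

def diffuse_labels(labels, t, T, num_classes, rng):
    """labels: integer array (H, W); returns a perturbed copy."""
    gamma = np.sin(0.5 * np.pi * t / T) ** 2       # 0 at t=0, 1 at t=T
    resample = np.less(rng.random(labels.shape), gamma)
    noise = rng.integers(0, num_classes, size=labels.shape)
    return np.where(resample, noise, labels)

rng = np.random.default_rng(0)
clean = rng.integers(0, 5, size=(8, 8))
fully_diffused = diffuse_labels(clean, t=1000, T=1000, num_classes=5, rng=rng)
</code></pre> <p class="is-size-7">At t=T every label has been resampled, which is why a map corrupted by annotation errors and its clean counterpart become indistinguishable to the generator.</p> </li>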
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.12557">arXiv:2402.12557</a> <span> [<a href="https://arxiv.org/pdf/2402.12557">pdf</a>, <a href="https://arxiv.org/format/2402.12557">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Creating a Fine Grained Entity Type Taxonomy Using LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gunn%2C+M">Michael Gunn</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Dohyun Park</a>, <a href="/search/cs?searchtype=author&query=Kamath%2C+N">Nidhish Kamath</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2402.12557v1-abstract-full" style="display: inline;"> In this study, we investigate the potential of GPT-4 and its advanced iteration, GPT-4 Turbo, in autonomously developing a detailed entity type taxonomy. Our objective is to construct a comprehensive taxonomy, starting from a broad classification of entity types (including objects, time, locations, organizations, events, actions, and subjects) similar to existing manually curated taxonomies. This classification is then progressively refined through iterative prompting techniques, leveraging GPT-4's internal knowledge base. The result is an extensive taxonomy comprising over 5000 nuanced entity types, which demonstrates remarkable quality upon subjective evaluation. We employed a straightforward yet effective prompting strategy, enabling the taxonomy to be dynamically expanded. The practical applications of this detailed taxonomy are diverse and significant: it facilitates the creation of new, more intricate branches through pattern-based combinations and notably enhances information extraction tasks such as relation extraction and event argument extraction. Our methodology not only introduces an innovative approach to taxonomy creation but also opens new avenues for applying such taxonomies in various computational linguistics and AI-related fields. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li>
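<li> <p class="is-size-7">Iterative refinement of this kind amounts to a breadth-first expansion loop over the taxonomy frontier. In the sketch below, <code>ask_llm</code> is a hypothetical callable standing in for a GPT-4 API call, and the prompt wording is invented; the paper's actual prompting strategy is not reproduced here.</p> <pre><code class="language-python">
# Skeleton of iterative taxonomy expansion by prompting (illustrative;
# ask_llm is a hypothetical stand-in for an LLM API call).
def expand_taxonomy(seed_types, ask_llm, rounds=3):
    taxonomy = {t: [] for t in seed_types}
    frontier = list(seed_types)
    for _ in range(rounds):
        next_frontier = []
        for node in frontier:
            reply = ask_llm(
                f"List 5 more specific subtypes of the entity type "
                f"'{node}', one per line."
            )
            children = [ln.strip() for ln in reply.splitlines() if ln.strip()]
            taxonomy[node] = children
            for child in children:
                taxonomy.setdefault(child, [])
            next_frontier.extend(children)
        frontier = next_frontier
    return taxonomy
</code></pre> </li>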
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01183">arXiv:2402.01183</a> <span> [<a href="https://arxiv.org/pdf/2402.01183">pdf</a>, <a href="https://arxiv.org/format/2402.01183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LINGO-Space: Language-Conditioned Incremental Grounding for Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dohyun Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+N">Nayoung Oh</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+D">Deokmin Hwang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehyung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2402.01183v1-abstract-full" style="display: inline;"> We aim to solve the problem of spatially localizing composite instructions referring to space: space grounding. Compared to current instance grounding, space grounding is challenging due to the ill-posedness of identifying locations referred to by discrete expressions and the compositional ambiguity of referring expressions. Therefore, we propose a novel probabilistic space-grounding methodology (LINGO-Space) that accurately identifies a probabilistic distribution of the space being referred to and incrementally updates it given subsequent referring expressions, leveraging configurable polar distributions. Our evaluations show that estimation using polar distributions enables a robot to ground locations successfully through $20$ table-top manipulation benchmark tests. We also show that updating the distribution helps the grounding method accurately narrow the referring space. We finally demonstrate the robustness of the space grounding with simulated manipulation and real quadruped robot navigation tasks. Code and videos are available at https://lingo-space.github.io. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2024</span> </p> </li>
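<li> <p class="is-size-7">A grid-based caricature of incremental grounding over polar coordinates: keep a belief over (radius, angle) around a reference object and multiply in the likelihood of each new referring expression. The Gaussian radial and von-Mises-style angular terms, and the angle conventions in the usage lines, are assumptions standing in for LINGO-Space's learned distributions.</p> <pre><code class="language-python">
# Incremental belief update over a polar grid (illustrative assumptions).
import numpy as np

R = np.linspace(0.1, 2.0, 60)[:, None]           # radii (meters)
TH = np.linspace(-np.pi, np.pi, 120)[None, :]    # angles (radians)
belief = np.ones((60, 120))
belief /= belief.sum()

def update(belief, mean_r, mean_th, kappa=4.0, sigma=0.3):
    radial = np.exp(-0.5 * ((R - mean_r) / sigma) ** 2)
    angular = np.exp(kappa * np.cos(TH - mean_th))   # von Mises kernel
    posterior = belief * radial * angular
    return posterior / posterior.sum()

belief = update(belief, mean_r=0.8, mean_th=np.pi / 2)  # e.g. "in front of the box"
belief = update(belief, mean_r=0.5, mean_th=np.pi)      # "...and to its left"
</code></pre> <p class="is-size-7">Each successive expression multiplies into the posterior, which is why later clauses narrow rather than overwrite the grounded region.</p> </li>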
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01183">arXiv:2402.01183</a> <span> [<a href="https://arxiv.org/pdf/2402.01183">pdf</a>, <a href="https://arxiv.org/format/2402.01183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LINGO-Space: Language-Conditioned Incremental Grounding for Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dohyun Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+N">Nayoung Oh</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+D">Deokmin Hwang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+D">Daehyung Park</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01183v1-abstract-short" style="display: inline;"> We aim to solve the problem of spatially localizing composite instructions referring to space: space grounding. Compared to current instance grounding, space grounding is challenging due to the ill-posedness of identifying locations referred to by discrete expressions and the compositional ambiguity of referring expressions. Therefore, we propose a novel probabilistic space-grounding methodology (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01183v1-abstract-full').style.display = 'inline'; document.getElementById('2402.01183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01183v1-abstract-full" style="display: none;"> We aim to solve the problem of spatially localizing composite instructions referring to space: space grounding. Compared to current instance grounding, space grounding is challenging due to the ill-posedness of identifying locations referred to by discrete expressions and the compositional ambiguity of referring expressions. Therefore, we propose a novel probabilistic space-grounding methodology (LINGO-Space) that accurately identifies a probabilistic distribution of space being referred to and incrementally updates it, given subsequent referring expressions leveraging configurable polar distributions. Our evaluations show that the estimation using polar distributions enables a robot to ground locations successfully through $20$ table-top manipulation benchmark tests. We also show that updating the distribution helps the grounding method accurately narrow the referring space. We finally demonstrate the robustness of the space grounding with simulated manipulation and real quadruped robot navigation tasks. Code and videos are available at https://lingo-space.github.io. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01183v1-abstract-full').style.display = 'none'; document.getElementById('2402.01183v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.12517">arXiv:2401.12517</a> <span> [<a href="https://arxiv.org/pdf/2401.12517">pdf</a>, <a href="https://arxiv.org/format/2401.12517">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> DDMI: Domain-Agnostic Latent Diffusion Models for Synthesizing High-Quality Implicit Neural Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dogyun Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sihyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sojin Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.12517v2-abstract-short" style="display: inline;"> Recent studies have introduced a new class of generative models for synthesizing implicit neural representations (INRs) that capture arbitrary continuous signals in various domains. These models opened the door for domain-agnostic generative models, but they often fail to achieve high-quality generation. We observed that the existing methods generate the weights of neural networks to parameterize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.12517v2-abstract-full').style.display = 'inline'; document.getElementById('2401.12517v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.12517v2-abstract-full" style="display: none;"> Recent studies have introduced a new class of generative models for synthesizing implicit neural representations (INRs) that capture arbitrary continuous signals in various domains. These models opened the door for domain-agnostic generative models, but they often fail to achieve high-quality generation. We observed that the existing methods generate the weights of neural networks to parameterize INRs and evaluate the network with fixed positional embeddings (PEs). Arguably, this architecture limits the expressive power of generative models and results in low-quality INR generation. To address this limitation, we propose Domain-agnostic Latent Diffusion Model for INRs (DDMI) that generates adaptive positional embeddings instead of neural networks' weights. 
xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>