Search | arXiv e-print repository
Showing 1–50 of 561 results for author: Cho, J

Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Cho%2C+J

Sorted by announcement date (newest first); 50 results per page.
1. arXiv:2502.02548 [pdf, other] cs.CV
   Mosaic3D: Foundation Dataset and Model for Open-Vocabulary 3D Segmentation
   Authors: Junha Lee, Chunghyun Park, Jaesung Choe, Yu-Chiang Frank Wang, Jan Kautz, Minsu Cho, Chris Choy
   Abstract: We tackle open-vocabulary 3D scene understanding by introducing a novel data generation pipeline and training framework. Our method addresses three critical requirements for effective training: precise 3D region segmentation, comprehensive textual descriptions, and sufficient dataset scale. By leveraging state-of-the-art open-vocabulary image segmentation models and region-aware Vision-Language Models, we develop an automatic pipeline that generates high-quality 3D mask-text pairs. Applying this pipeline to multiple 3D scene datasets, we create Mosaic3D-5.6M, a dataset of over 30K annotated scenes with 5.6M mask-text pairs, significantly larger than existing datasets. Building upon this data, we propose Mosaic3D, a foundation model combining a 3D encoder trained with contrastive learning and a lightweight mask decoder for open-vocabulary 3D semantic and instance segmentation. Our approach achieves state-of-the-art results on open-vocabulary 3D semantic and instance segmentation tasks including ScanNet200, Matterport3D, and ScanNet++, with ablation studies validating the effectiveness of our large-scale training data.
   Submitted 4 February, 2025; originally announced February 2025.
   Comments: project page: https://nvlabs.github.io/Mosaic3D/
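   The entry above describes a 3D encoder trained with contrastive learning against automatically generated mask-text pairs. A minimal sketch of what such a mask-text contrastive objective could look like in PyTorch; the function, names, and shapes are illustrative assumptions, not taken from the paper:

```python
import torch
import torch.nn.functional as F

def mask_text_contrastive_loss(point_feats, masks, text_embs, temperature=0.07):
    """InfoNCE-style loss aligning pooled per-mask 3D features with captions.

    point_feats: (N, D) per-point features from a 3D encoder (assumed).
    masks:       (M, N) boolean matrix, one row per 3D region mask.
    text_embs:   (M, D) caption embeddings, row i describing mask i.
    """
    # Average-pool point features inside each mask to get one embedding per region.
    weights = masks.float() / masks.float().sum(dim=1, keepdim=True).clamp(min=1)
    mask_embs = weights @ point_feats                       # (M, D)
    mask_embs = F.normalize(mask_embs, dim=-1)
    text_embs = F.normalize(text_embs, dim=-1)
    logits = mask_embs @ text_embs.T / temperature          # (M, M) similarities
    targets = torch.arange(masks.size(0), device=logits.device)
    # Symmetric cross-entropy: each mask should match its own caption and vice versa.
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))
```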
2. arXiv:2501.16599 [pdf, other] cs.LG
   Toward Safe Integration of UAM in Terminal Airspace: UAM Route Feasibility Assessment using Probabilistic Aircraft Trajectory Prediction
   Authors: Jungwoo Cho, Seongjin Choi
   Abstract: Integrating Urban Air Mobility (UAM) into airspace managed by Air Traffic Control (ATC) poses significant challenges, particularly in congested terminal environments. This study proposes a framework to assess the feasibility of UAM route integration using probabilistic aircraft trajectory prediction. By leveraging conditional Normalizing Flows, the framework predicts short-term trajectory distributions of conventional aircraft, enabling UAM vehicles to dynamically adjust speeds and maintain safe separations. The methodology was applied to airspace over the Seoul metropolitan area, encompassing interactions between UAM and conventional traffic at multiple altitudes and lanes. The results reveal that lanes and routes at different physical locations experience varying interaction patterns and encounter dynamics. For instance, Lane 1 at lower altitudes (1,500 ft and 2,000 ft) exhibited minimal interactions with conventional aircraft, resulting in the largest separations and the most stable delay proportions. In contrast, Lane 4 near the airport experienced more frequent and complex interactions due to its proximity to departing traffic. The limited trajectory data for departing aircraft in this region occasionally led to tighter separations and increased operational challenges. This study underscores the potential of predictive modeling in facilitating UAM integration while highlighting critical trade-offs between safety and efficiency. The findings contribute to refining airspace management strategies and offer insights for scaling UAM operations in complex urban environments.
   Submitted 27 January, 2025; originally announced January 2025.
   Comments: 10 pages, 7 figures
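   The framework above samples short-term trajectory distributions from a conditional normalizing flow and uses them to keep UAM vehicles safely separated. A hedged Monte-Carlo sketch of that kind of feasibility check; `sample_fn`, the shapes, and the separation threshold are stand-ins, not the paper's implementation:

```python
import numpy as np

def assess_route_feasibility(sample_fn, history, uam_waypoints,
                             n_samples=500, min_sep_m=926.0):
    """Estimate the probability that a planned UAM route violates separation.

    sample_fn(history, n): stand-in for a trained conditional normalizing flow;
        returns (n, T, 2) sampled future positions of a conventional aircraft.
    uam_waypoints: (T, 2) planned UAM positions over the same horizon.
    min_sep_m: required horizontal separation (926 m is roughly 0.5 NM,
        an illustrative value, not one from the paper).
    """
    futures = sample_fn(history, n_samples)                         # (n, T, 2)
    dists = np.linalg.norm(futures - uam_waypoints[None], axis=-1)  # (n, T)
    violation = (dists < min_sep_m).any(axis=1)                     # per-sample flag
    return violation.mean()

# A UAM vehicle could then delay or slow down whenever this estimate exceeds
# an acceptable risk threshold, e.g. assess_route_feasibility(...) > 0.05.
```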
3. arXiv:2501.13567 [pdf, other] cs.CL, cs.AI
   K-COMP: Retrieval-Augmented Medical Domain Question Answering With Knowledge-Injected Compressor
   Authors: Jeonghun Cho, Gary Geunbae Lee
   Abstract: Retrieval-augmented question answering (QA) integrates external information and thereby increases the QA accuracy of reader models that lack domain knowledge. However, documents retrieved for closed domains require high expertise, so the reader model may have difficulty fully comprehending the text. Moreover, the retrieved documents contain thousands of tokens, some unrelated to the question. As a result, the documents include some inaccurate information, which could lead the reader model to mistrust the passages and could result in hallucinations. To solve these problems, we propose K-comp (Knowledge-injected compressor), which provides the knowledge required to answer correctly. The compressor automatically generates the prior knowledge necessary to facilitate the answering process prior to compression of the retrieved passages. Subsequently, the passages are compressed autoregressively, with the generated knowledge being integrated into the compression process. This process ensures alignment between the question intent and the compressed context. By augmenting this prior knowledge and concise context, the reader models are guided toward relevant answers and trust the context.
   Submitted 6 February, 2025; v1 submitted 23 January, 2025; originally announced January 2025.
   Comments: NAACL 2025
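   K-comp is described as generating prior knowledge first and then compressing the retrieved passages conditioned on it. In the paper this is a single trained compressor decoding autoregressively; the sketch below only approximates the idea with two calls to a generic text generator, and every prompt string is invented for illustration:

```python
def knowledge_injected_compress(generate, question, passages):
    """Two-step generate-then-compress pipeline in the spirit of K-comp.

    generate: any text-generation callable, generate(prompt) -> str.
    Step 1 produces question-specific prior knowledge; step 2 compresses the
    retrieved passages conditioned on that knowledge, keeping the summary
    aligned with the question intent.
    """
    knowledge = generate(
        f"Question: {question}\nBackground knowledge needed to answer:"
    )
    context = generate(
        f"Question: {question}\nBackground: {knowledge}\n"
        f"Passages: {' '.join(passages)}\n"
        "Compress the passages into a concise, question-focused context:"
    )
    return knowledge, context
```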
4. arXiv:2501.06780 [pdf, other] cs.AR, cs.DC, cs.ET, cs.LG, cs.PL
   COMPASS: A Compiler Framework for Resource-Constrained Crossbar-Array Based In-Memory Deep Learning Accelerators
   Authors: Jihoon Park, Jeongin Choe, Dohyun Kim, Jae-Joon Kim
   Abstract: Recently, crossbar array based in-memory accelerators have been gaining interest due to their high throughput and energy efficiency. While software and compiler support for the in-memory accelerators has also been introduced, it is currently limited to the case where all weights are assumed to be on-chip. This limitation becomes apparent as network sizes grow significantly beyond the in-memory footprint. Weight replacement schemes are essential to address this issue. We propose COMPASS, a compiler framework for resource-constrained crossbar-based processing-in-memory (PIM) deep neural network (DNN) accelerators. COMPASS is specially targeted for networks that exceed the capacity of PIM crossbar arrays, necessitating access to external memories. We propose an algorithm to determine the optimal partitioning that divides the layers so that each partition can be accelerated on chip. Our scheme takes into account the data dependence between layers, core utilization, and the number of write instructions to minimize latency and memory accesses and to improve energy efficiency. Simulation results demonstrate that COMPASS can accommodate many more networks using a minimal memory footprint, while improving throughput by 1.78X and providing 1.28X savings in energy-delay product (EDP) over baseline partitioning methods.
   Submitted 12 January, 2025; originally announced January 2025.
   Comments: Accepted IEEE DATE 2025
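   The partitioning algorithm is only summarized above; COMPASS additionally weighs data dependence and core utilization. As a toy stand-in, here is a dynamic program that splits a linear layer chain into on-chip partitions while minimizing weight-write traffic under a capacity constraint (the names and the cost model are assumptions, not the paper's algorithm):

```python
def partition_layers(layer_weights, capacity, write_cost=1.0):
    """Split a linear chain of DNN layers into contiguous on-chip partitions.

    layer_weights: per-layer weight footprint (e.g., crossbar cells needed).
    capacity:      on-chip crossbar capacity available to one partition.
    Each partition swap incurs writing its weights; the DP minimizes that
    total write cost over all valid split points.
    """
    n = len(layer_weights)
    INF = float("inf")
    best = [INF] * (n + 1)          # best[i]: min cost to map layers [0, i)
    cut = [0] * (n + 1)
    best[0] = 0.0
    for i in range(1, n + 1):
        total = 0.0
        for j in range(i - 1, -1, -1):      # candidate partition: layers [j, i)
            total += layer_weights[j]
            if total > capacity:            # partition no longer fits on chip
                break
            cost = best[j] + write_cost * total
            if cost < best[i]:
                best[i], cut[i] = cost, j
    if best[n] == INF:                      # a single layer exceeds capacity
        return None, INF
    parts, i = [], n
    while i > 0:
        parts.append((cut[i], i))           # layers [cut[i], i) share one residency
        i = cut[i]
    return parts[::-1], best[n]
```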
5. arXiv:2501.05906 [pdf, other] quant-ph, cs.LG
   Q-MAML: Quantum Model-Agnostic Meta-Learning for Variational Quantum Algorithms
   Authors: Junyong Lee, JeiHee Cho, Shiho Kim
   Abstract: In the Noisy Intermediate-Scale Quantum (NISQ) era, using variational quantum algorithms (VQAs) to solve optimization problems has become a key application. However, these algorithms face significant challenges, such as choosing an effective initial set of parameters and the limited quantum processing time that restricts the number of optimization iterations. In this study, we introduce a new framework for optimizing parameterized quantum circuits (PQCs) that employs a classical optimizer, inspired by the Model-Agnostic Meta-Learning (MAML) technique. This approach aims to achieve better parameter initialization that ensures fast convergence. Our framework features a classical neural network, called Learner, which interacts with a PQC using the output of the Learner as an initial parameter. During the pre-training phase, the Learner is trained with a meta-objective based on the quantum circuit cost function. In the adaptation phase, the framework requires only a few PQC updates to converge to a more accurate value, while the Learner remains unchanged. This method is highly adaptable and is effectively extended to various Hamiltonian optimization problems. We validate our approach through experiments, including distribution function mapping and optimization of the Heisenberg XYZ Hamiltonian. The results imply that the Learner successfully estimates initial parameters that generalize across the problem space, enabling fast adaptation.
   Submitted 10 January, 2025; originally announced January 2025.
   Comments: 8 pages, 8 figures, to be published in AAAI 25
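   The described loop is close to classic MAML with the inner loop replaced by PQC optimization. A rough PyTorch sketch under that reading; `circuit_cost` stands in for a cost estimate that, on real hardware, would come from circuit evaluations (e.g., via parameter-shift gradients), and every name here is illustrative:

```python
import torch

def pretrain_learner(learner, circuit_cost, task_sampler, steps=1000, lr=1e-3):
    """Meta-train a classical network to emit good initial PQC parameters.

    learner:      nn.Module mapping a task descriptor to PQC parameters.
    circuit_cost: differentiable surrogate of the quantum cost function,
                  circuit_cost(params, task) -> scalar tensor.
    task_sampler: yields task descriptors (e.g., Hamiltonian coefficients).
    """
    opt = torch.optim.Adam(learner.parameters(), lr=lr)
    for _ in range(steps):
        task = task_sampler()
        init_params = learner(task)             # proposed initialization
        loss = circuit_cost(init_params, task)  # meta-objective: cost at init
        opt.zero_grad()
        loss.backward()
        opt.step()

def adapt(learner, circuit_cost, task, inner_steps=5, lr=0.1):
    """Adaptation phase: a few PQC-only updates; the Learner stays frozen."""
    params = learner(task).detach().requires_grad_(True)
    opt = torch.optim.SGD([params], lr=lr)
    for _ in range(inner_steps):
        loss = circuit_cost(params, task)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return params
```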
6. arXiv:2501.01980 [pdf, other] cs.CV, cs.GR; doi: 10.1145/3687767
   Polarimetric BSSRDF Acquisition of Dynamic Faces
   Authors: Hyunho Ha, Inseung Hwang, Nestor Monzon, Jaemin Cho, Donggun Kim, Seung-Hwan Baek, Adolfo Muñoz, Diego Gutierrez, Min H. Kim
   Abstract: Acquisition and modeling of polarized light reflection and scattering help reveal the shape, structure, and physical characteristics of an object, which is increasingly important in computer graphics. However, current polarimetric acquisition systems are limited to static and opaque objects. Human faces, on the other hand, present a particularly difficult challenge, given their complex structure and reflectance properties, the strong presence of spatially-varying subsurface scattering, and their dynamic nature. We present a new polarimetric acquisition method for dynamic human faces, which focuses on capturing spatially varying appearance and precise geometry, across a wide spectrum of skin tones and facial expressions. It includes both single and heterogeneous subsurface scattering, index of refraction, and specular roughness and intensity, among other parameters, while revealing biophysically-based components such as inner- and outer-layer hemoglobin, eumelanin and pheomelanin. Our method leverages such components' unique multispectral absorption profiles to quantify their concentrations, which in turn inform our model about the complex interactions occurring within the skin layers. To our knowledge, our work is the first to simultaneously acquire polarimetric and spectral reflectance information alongside biophysically-based skin parameters and geometry of dynamic human faces. Moreover, our polarimetric skin model integrates seamlessly into various rendering pipelines.
   Submitted 29 December, 2024; originally announced January 2025.
   ACM Class: I.3.7
   Journal ref: ACM Transactions on Graphics 43, 6, Article 275 (December 2024)
7. arXiv:2412.19459 [pdf, other] cs.CV, eess.IV
   A Prototype Unit for Image De-raining using Time-Lapse Data
   Authors: Jaehoon Cho, Minjung Yoo, Jini Yang, Sunok Kim
   Abstract: We address the challenge of single-image de-raining, a task that involves recovering rain-free background information from a single rain image. While recent advancements have utilized real-world time-lapse data for training, enabling the estimation of consistent backgrounds and realistic rain streaks, these methods often suffer from high computational and memory consumption, limiting their applicability in real-world scenarios. In this paper, we introduce a novel solution: the Rain Streak Prototype Unit (RsPU). The RsPU efficiently encodes rain streak-relevant features as real-time prototypes derived from time-lapse data, eliminating the need for excessive memory resources. Our de-raining network combines encoder-decoder networks with the RsPU, allowing us to learn and encapsulate diverse rain streak-relevant features as concise prototypes, employing an attention-based approach. To ensure the effectiveness of our approach, we propose a feature prototype loss encompassing cohesion and divergence components. This loss function captures both the compactness and diversity aspects of the prototypical rain streak features within the RsPU. We evaluate our method on various de-raining benchmarks, accompanied by comprehensive ablation studies, and show that it achieves competitive results on various rain images compared to state-of-the-art methods.
   Submitted 27 December, 2024; originally announced December 2024.
   Comments: Accepted by BMVC 2024
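   The prototype loss is not spelled out in the abstract; one plausible reading of its cohesion and divergence components is sketched below in PyTorch. The exact formulation is an assumption, not the paper's definition:

```python
import torch
import torch.nn.functional as F

def prototype_loss(features, assignments, prototypes, margin=1.0):
    """Illustrative cohesion + divergence objective for learned prototypes.

    features:    (B, D) rain-streak feature vectors from the encoder.
    assignments: (B,) long tensor, index of each feature's matched prototype.
    prototypes:  (K, D) learned rain-streak prototypes.
    """
    # Cohesion: pull each feature toward its assigned prototype (compactness).
    cohesion = F.mse_loss(features, prototypes[assignments])
    # Divergence: push distinct prototypes at least `margin` apart (diversity).
    dists = torch.cdist(prototypes, prototypes)             # (K, K) pairwise
    off_diag = ~torch.eye(len(prototypes), dtype=torch.bool, device=dists.device)
    divergence = F.relu(margin - dists[off_diag]).mean()
    return cohesion + divergence
```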
8. arXiv:2412.18273 [pdf, other] cs.CV, cs.AI
   Sampling Bag of Views for Open-Vocabulary Object Detection
   Authors: Hojun Choi, Junsuk Choe, Hyunjung Shim
   Abstract: Existing open-vocabulary object detection (OVD) develops methods for testing unseen categories by aligning object region embeddings with corresponding VLM features. A recent study leverages the idea that VLMs implicitly learn compositional structures of semantic concepts within the image. Instead of using an individual region embedding, it utilizes a bag of region embeddings as a new representation to incorporate compositional structures into the OVD task. However, this approach often fails to capture the contextual concepts of each region, leading to noisy compositional structures. This results in only marginal performance improvements and reduced efficiency. To address this, we propose a novel concept-based alignment method that samples a more powerful and efficient compositional structure. Our approach groups contextually related "concepts" into a bag and adjusts the scale of concepts within the bag for more effective embedding alignment. Combined with Faster R-CNN, our method achieves improvements of 2.6 box AP50 and 0.5 mask AP over prior work on novel categories in the open-vocabulary COCO and LVIS benchmarks. Furthermore, our method reduces CLIP computation in FLOPs by 80.3% compared to previous research, significantly enhancing efficiency. Experimental results demonstrate that the proposed method outperforms previous state-of-the-art models on the OVD datasets.
   Submitted 24 December, 2024; originally announced December 2024.
   Comments: 19 pages
9. arXiv:2412.16978 [pdf, other] cs.CV, cs.AI
   PromptDresser: Improving the Quality and Controllability of Virtual Try-On via Generative Textual Prompt and Prompt-aware Mask
   Authors: Jeongho Kim, Hoiyeong Jin, Sunghyun Park, Jaegul Choo
   Abstract: Recent virtual try-on approaches have advanced by fine-tuning pre-trained text-to-image diffusion models to leverage their powerful generative ability. However, the use of text prompts in virtual try-on is still underexplored. This paper tackles a text-editable virtual try-on task that changes the clothing item based on the provided clothing image while editing the wearing style (e.g., tucking style, fit) according to the text descriptions. In text-editable virtual try-on, three key aspects exist: (i) designing rich text descriptions for paired person-clothing data to train the model, (ii) addressing the conflicts where textual information of the existing person's clothing interferes with the generation of the new clothing, and (iii) adaptively adjusting the inpainting mask aligned with the text descriptions, ensuring proper editing areas while preserving the original person's appearance irrelevant to the new clothing. To address these aspects, we propose PromptDresser, a text-editable virtual try-on model that leverages large multimodal model (LMM) assistance to enable high-quality and versatile manipulation based on generative text prompts. Our approach utilizes LMMs via in-context learning to generate detailed text descriptions for person and clothing images independently, including pose details and editing attributes, with minimal human cost. Moreover, to ensure proper editing areas, we adaptively adjust the inpainting mask depending on the text prompts. We found that our approach, utilizing detailed text prompts, not only enhances text editability but also effectively conveys clothing details that are difficult to capture through images alone, thereby enhancing image quality. Our code is available at https://github.com/rlawjdghek/PromptDresser.
   Submitted 22 December, 2024; originally announced December 2024.
   Comments: 20 pages
10. arXiv:2412.16085 [pdf, other] eess.IV, cs.CV
    Efficient MedSAMs: Segment Anything in Medical Images on Laptop
    Authors: Jun Ma, Feifei Li, Sumin Kim, Reza Asakereh, Bao-Hiep Le, Dang-Khoa Nguyen-Vu, Alexander Pfefferle, Muxin Wei, Ruochen Gao, Donghang Lyu, Songxiao Yang, Lennart Purucker, Zdravko Marinov, Marius Staring, Haisheng Lu, Thuy Thanh Dao, Xincheng Ye, Zhi Li, Gianluca Brugnara, Philipp Vollmuth, Martha Foltyn-Dumitru, Jaeyoung Cho, Mustafa Ahmed Mahmutoglu, Martin Bendszus, Irada Pflüger, et al. (57 additional authors not shown)
    Abstract: Promptable segmentation foundation models have emerged as a transformative approach to addressing the diverse needs in medical images, but most existing models require expensive computing, posing a big barrier to their adoption in clinical practice. In this work, we organized the first international competition dedicated to promptable medical image segmentation, featuring a large-scale dataset spanning nine common imaging modalities from over 20 different institutions. The top teams developed lightweight segmentation foundation models and implemented an efficient inference pipeline that substantially reduced computational requirements while maintaining state-of-the-art segmentation accuracy. Moreover, the post-challenge phase advanced the algorithms through the design of performance booster and reproducibility tasks, resulting in improved algorithms and validated reproducibility of the winning solution. Furthermore, the best-performing algorithms have been incorporated into the open-source software with a user-friendly interface to facilitate clinical adoption. The data and code are publicly available to foster the further development of medical image segmentation foundation models and pave the way for impactful real-world applications.
    Submitted 20 December, 2024; originally announced December 2024.
    Comments: CVPR 2024 MedSAM on Laptop Competition Summary: https://www.codabench.org/competitions/1847/
11. arXiv:2412.13569 [pdf, other] cs.CV
    Multi-View Pedestrian Occupancy Prediction with a Novel Synthetic Dataset
    Authors: Sithu Aung, Min-Cheol Sagong, Junghyun Cho
    Abstract: We address an advanced challenge of predicting pedestrian occupancy as an extension of multi-view pedestrian detection in urban traffic. To support this, we have created a new synthetic dataset called MVP-Occ, designed for dense pedestrian scenarios in large-scale scenes. Our dataset provides detailed representations of pedestrians using voxel structures, accompanied by rich semantic scene understanding labels, facilitating visual navigation and insights into pedestrian spatial information. Furthermore, we present a robust baseline model, termed OmniOcc, capable of predicting both the voxel occupancy state and panoptic labels for the entire scene from multi-view images. Through in-depth analysis, we identify and evaluate the key elements of our proposed model, highlighting their specific contributions and importance.
    Submitted 18 December, 2024; originally announced December 2024.
    Comments: AAAI 2025
arXiv:2412.13469 [pdf, other] cs.CV, cs.GR
Title: Enabling Region-Specific Control via Lassos in Point-Based Colorization
Authors: Sanghyeon Lee, Jooyeol Yun, Jaegul Choo
Abstract: Point-based interactive colorization techniques allow users to effortlessly colorize grayscale images using user-provided color hints. However, point-based methods often face challenges when different colors are given to semantically similar areas, leading to color intermingling and unsatisfactory results, an issue we refer to as color collapse. The fundamental cause of color collapse is the inadequacy of points for defining the boundaries of each color. To mitigate color collapse, we introduce a lasso tool that can control the scope of each color hint. Additionally, we design a framework that leverages the user-provided lassos to localize the attention masks. The experimental results show that using a single lasso is as effective as applying 4.18 individual color hints and can achieve the desired outcomes in 30% less time than using points alone.
Submitted 25 January, 2025; v1 submitted 17 December, 2024; originally announced December 2024.
Comments: Accepted to AAAI 2025

arXiv:2412.13401 [pdf, other] cs.CV
Title: Zero-Shot Low Light Image Enhancement with Diffusion Prior
Authors: Joshua Cho, Sara Aghajanzadeh, Zhen Zhu, D. A. Forsyth
Abstract: Balancing aesthetic quality with fidelity when enhancing images from challenging, degraded sources is a core objective in computational photography. In this paper, we address low light image enhancement (LLIE), a task in which dark images often contain limited visible information. Diffusion models, known for their powerful image enhancement capacities, are a natural choice for this problem. However, their deep generative priors can also lead to hallucinations, introducing non-existent elements or substantially altering the visual semantics of the original scene. In this work, we introduce a novel zero-shot method for controlling and refining the generative behavior of diffusion models for dark-to-light image conversion tasks. Our method demonstrates superior performance over existing state-of-the-art methods in the task of low-light image enhancement, as evidenced by both quantitative metrics and qualitative analysis.
Submitted 22 December, 2024; v1 submitted 17 December, 2024; originally announced December 2024.
arXiv:2412.10208 [pdf, other] cs.LG
Title: Efficient Generative Modeling with Residual Vector Quantization-Based Tokens
Authors: Jaehyeon Kim, Taehong Moon, Keon Lee, Jaewoong Cho
Abstract: We explore the use of Residual Vector Quantization (RVQ) for high-fidelity generation in vector-quantized generative models. This quantization technique maintains higher data fidelity by employing more in-depth tokens. However, increasing the token number in generative models leads to slower inference speeds. To this end, we introduce ResGen, an efficient RVQ-based discrete diffusion model that generates high-fidelity samples without compromising sampling speed. Our key idea is a direct prediction of the vector embedding of collective tokens rather than individual ones. Moreover, we demonstrate that our proposed token masking and multi-token prediction method can be formulated within a principled probabilistic framework using a discrete diffusion process and variational inference. We validate the efficacy and generalizability of the proposed method on two challenging tasks across different modalities: conditional image generation on ImageNet 256x256 and zero-shot text-to-speech synthesis. Experimental results demonstrate that ResGen outperforms autoregressive counterparts in both tasks, delivering superior performance without compromising sampling speed. Furthermore, as we scale the depth of RVQ, our generative models exhibit enhanced generation fidelity or faster sampling speeds compared to similarly sized baseline models. The project page can be found at https://resgen-genai.github.io
Submitted 15 December, 2024; v1 submitted 13 December, 2024; originally announced December 2024.
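To make the tokenization scheme concrete, here is a minimal sketch of residual vector quantization, the primitive ResGen builds on. The codebooks are random and all names (rvq_encode, DEPTH, and so on) are illustrative, not taken from the paper's code:

```python
# Minimal sketch of residual vector quantization (RVQ). Codebooks are random
# here; in practice they are learned. Names are invented for illustration.
import numpy as np

rng = np.random.default_rng(0)
DIM, CODEBOOK_SIZE, DEPTH = 8, 16, 4
codebooks = rng.normal(size=(DEPTH, CODEBOOK_SIZE, DIM))

def rvq_encode(x):
    """Return one token per depth; each stage quantizes the remaining residual."""
    residual, tokens = x.copy(), []
    for cb in codebooks:
        idx = int(np.argmin(np.linalg.norm(cb - residual, axis=1)))
        tokens.append(idx)
        residual -= cb[idx]          # deeper stages refine what is left over
    return tokens

def rvq_decode(tokens):
    """Reconstruction is the sum of the selected codewords across depths."""
    return sum(codebooks[d][t] for d, t in enumerate(tokens))

x = rng.normal(size=DIM)
tokens = rvq_encode(x)
print(tokens, np.linalg.norm(x - rvq_decode(tokens)))  # error shrinks with DEPTH
```

Each extra depth level refines the residual left by the previous one, which is the fidelity-versus-token-count trade-off the abstract describes.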
arXiv:2412.08890 [pdf, other] cs.LG
Title: Lexico: Extreme KV Cache Compression via Sparse Coding over Universal Dictionaries
Authors: Junhyuck Kim, Jongho Park, Jaewoong Cho, Dimitris Papailiopoulos
Abstract: We introduce Lexico, a novel KV cache compression method that leverages sparse coding with a universal dictionary. Our key finding is that the key-value cache in modern LLMs can be accurately approximated using sparse linear combinations of atoms from a small, input-agnostic dictionary of ~4k atoms, enabling efficient compression across different input prompts, tasks, and models. Using orthogonal matching pursuit for sparse approximation, Lexico achieves flexible compression ratios through direct sparsity control. On GSM8K, across multiple model families (Mistral, Llama 3, Qwen2.5), Lexico maintains 90-95% of the original performance while using only 15-25% of the full KV-cache memory, outperforming both quantization and token eviction methods. Notably, Lexico remains effective in low-memory regimes where 2-bit quantization fails, achieving up to 1.7x better compression on LongBench and GSM8K while maintaining high accuracy.
Submitted 11 December, 2024; originally announced December 2024.
Comments: 18 pages, 7 figures
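The compression idea is easy to see in miniature. Below is a sketch of orthogonal matching pursuit against a fixed random dictionary; the dictionary size and sparsity level echo the abstract's setup, but the code is illustrative and not Lexico's implementation:

```python
# Sketch: approximate a key/value vector as a sparse combination of atoms from
# a fixed, input-agnostic dictionary via orthogonal matching pursuit (OMP).
# The random dictionary and all constants are assumptions for illustration.
import numpy as np

rng = np.random.default_rng(1)
DIM, N_ATOMS, SPARSITY = 64, 4096, 8
dictionary = rng.normal(size=(N_ATOMS, DIM))
dictionary /= np.linalg.norm(dictionary, axis=1, keepdims=True)

def omp(x, s=SPARSITY):
    """Greedily pick s atoms; store only (indices, coefficients)."""
    support, residual = [], x.copy()
    for _ in range(s):
        support.append(int(np.argmax(np.abs(dictionary @ residual))))
        atoms = dictionary[support]                       # (k, DIM)
        coeffs, *_ = np.linalg.lstsq(atoms.T, x, rcond=None)
        residual = x - atoms.T @ coeffs                   # re-fit, then recurse
    return support, coeffs

kv = rng.normal(size=DIM)
idx, c = omp(kv)
# Compression: store SPARSITY indices + coefficients instead of DIM floats.
print(len(idx), np.linalg.norm(kv - dictionary[idx].T @ c) / np.linalg.norm(kv))
```

Direct control of `s` is what gives the flexible compression ratios the abstract mentions.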
arXiv:2412.07813 [pdf, other] cs.GT, cs.AI, cs.LG
Title: How Can Incentives and Cut Layer Selection Influence Data Contribution in Split Federated Learning?
Authors: Joohyung Lee, Jungchan Cho, Wonjun Lee, Mohamed Seif, H. Vincent Poor
Abstract: To alleviate the training burden in federated learning while enhancing convergence speed, Split Federated Learning (SFL) has emerged as a promising approach by combining the advantages of federated and split learning. However, recent studies have largely overlooked competitive situations. In this framework, the SFL model owner can choose the cut layer to balance the training load between the server and clients, ensuring the necessary level of privacy for the clients. Additionally, the SFL model owner sets incentives to encourage client participation in the SFL process. The optimization strategies employed by the SFL model owner influence clients' decisions regarding the amount of data they contribute, taking into account the incentives shared across clients and the anticipated energy consumption during SFL. To address this framework, we model the problem using a hierarchical decision-making approach, formulated as a single-leader multi-follower Stackelberg game. We demonstrate the existence and uniqueness of the Nash equilibrium among clients and analyze the Stackelberg equilibrium by examining the leader's game. Furthermore, we discuss privacy concerns related to differential privacy and the criteria for selecting the minimum required cut layer. Our findings show that the Stackelberg equilibrium solution maximizes the utility for both the clients and the SFL model owner.
Submitted 23 January, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: 12 pages, 10 figures
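The single-leader, multi-follower structure can be illustrated with a toy backward-induction computation. The utility functions below are invented stand-ins for the paper's model; only the solve-the-follower-first logic carries over:

```python
# Toy Stackelberg sketch: the leader (SFL model owner) fixes an incentive,
# the follower (a client) best-responds with a data contribution. The
# quadratic utilities are assumptions, not the paper's actual formulation.
import numpy as np

ENERGY_COST = 0.5   # client's quadratic cost of contributing data (assumed)
VALUE = 2.0         # owner's marginal value of contributed data (assumed)

def client_best_response(reward):
    # maximize reward*d - ENERGY_COST*d**2  ->  closed form d* = reward/(2c)
    return reward / (2 * ENERGY_COST)

def owner_utility(reward):
    d = client_best_response(reward)       # anticipate the follower's play
    return VALUE * d - reward * d

rewards = np.linspace(0.0, VALUE, 201)
best = rewards[np.argmax([owner_utility(r) for r in rewards])]
print(best, client_best_response(best))    # leader's equilibrium incentive
```

Solving the follower's problem in closed form and substituting it into the leader's objective is the standard recipe for analyzing such games.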
arXiv:2412.05276 [pdf, other] cs.CV, cs.LG
Title: Sparse autoencoders reveal selective remapping of visual concepts during adaptation
Authors: Hyesu Lim, Jinho Choi, Jaegul Choo, Steffen Schneider
Abstract: Adapting foundation models for specific purposes has become a standard approach to build machine learning systems for downstream applications. Yet, it is an open question which mechanisms take place during adaptation. Here we develop a new Sparse Autoencoder (SAE) for the CLIP vision transformer, named PatchSAE, to extract interpretable concepts at granular levels (e.g., shape, color, or semantics of an object) and their patch-wise spatial attributions. We explore how these concepts influence the model output in downstream image classification tasks and investigate how recent state-of-the-art prompt-based adaptation techniques change the association of model inputs to these concepts. While activations of concepts change slightly between adapted and non-adapted models, we find that the majority of gains on common adaptation tasks can be explained by concepts already present in the non-adapted foundation model. This work provides a concrete framework to train and use SAEs for Vision Transformers and provides insights into explaining adaptation mechanisms.
Submitted 6 December, 2024; originally announced December 2024.
Comments: A demo is available at github.com/dynamical-inference/patchsae
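For readers unfamiliar with SAEs, a minimal forward pass looks roughly like the following; the layer sizes are plausible for a CLIP ViT but assumed here, and PatchSAE's actual architecture and training details live in the paper and demo repository:

```python
# Minimal sparse-autoencoder (SAE) forward pass over patch tokens: an
# overcomplete ReLU bottleneck with a reconstruction + L1 sparsity loss.
# Shapes and names are illustrative, not PatchSAE's implementation.
import numpy as np

rng = np.random.default_rng(2)
D_MODEL, D_SAE = 768, 8192            # overcomplete: many more latents than dims
W_enc = rng.normal(scale=0.02, size=(D_MODEL, D_SAE))
W_dec = rng.normal(scale=0.02, size=(D_SAE, D_MODEL))
b_enc = np.zeros(D_SAE)

def sae(tokens, l1=1e-3):
    """tokens: (n_patches, D_MODEL). Returns loss and sparse concept codes."""
    z = np.maximum(tokens @ W_enc + b_enc, 0.0)    # sparse latent activations
    recon = z @ W_dec
    loss = np.mean((recon - tokens) ** 2) + l1 * np.abs(z).mean()
    return loss, z

patches = rng.normal(size=(196, D_MODEL))          # e.g. a 14x14 ViT patch grid
loss, z = sae(patches)
# Per-patch activations give the spatial attribution of each latent "concept".
print(loss, (z > 0).mean())
```

Running the encoder per patch, rather than only on the class token, is what yields the patch-wise spatial attributions the abstract highlights.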
arXiv:2412.04459 [pdf, other] cs.CV, cs.GR
Title: Sparse Voxels Rasterization: Real-time High-fidelity Radiance Field Rendering
Authors: Cheng Sun, Jaesung Choe, Charles Loop, Wei-Chiu Ma, Yu-Chiang Frank Wang
Abstract: We propose an efficient radiance field rendering algorithm that incorporates a rasterization process on sparse voxels without neural networks or 3D Gaussians. There are two key contributions coupled with the proposed system. The first is to render sparse voxels in the correct depth order along pixel rays by using dynamic Morton ordering, which avoids the well-known popping artifact found in Gaussian splatting. Second, we adaptively fit sparse voxels to different levels of detail within scenes, faithfully reproducing scene details while achieving high rendering frame rates. Our method improves the previous neural-free voxel grid representation by over 4 dB PSNR and renders more than 10x faster, achieving novel-view synthesis results comparable to the state of the art. Additionally, our neural-free sparse voxels are seamlessly compatible with grid-based 3D processing algorithms. We achieve promising mesh reconstruction accuracy by integrating TSDF-Fusion and Marching Cubes into our sparse grid system.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Code release in progress
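Morton (Z-order) codes order 3D cells by interleaving coordinate bits. The sketch below shows the static encoding only; the paper's contribution is a dynamic, per-ray variant, which this does not capture:

```python
# Morton (Z-order) encoding: interleave the bits of x, y, z so that sorting by
# the resulting code traverses voxels in Z-order. Standard bit tricks; only
# the static ordering is shown here.
def part1by2(n: int) -> int:
    """Spread the low 10 bits of n with two zero bits between each bit."""
    n &= 0x000003FF
    n = (n ^ (n << 16)) & 0xFF0000FF
    n = (n ^ (n << 8)) & 0x0300F00F
    n = (n ^ (n << 4)) & 0x030C30C3
    n = (n ^ (n << 2)) & 0x09249249
    return n

def morton3(x: int, y: int, z: int) -> int:
    return part1by2(x) | (part1by2(y) << 1) | (part1by2(z) << 2)

voxels = [(3, 1, 2), (0, 0, 1), (1, 2, 0)]
print(sorted(voxels, key=lambda v: morton3(*v)))  # Z-order traversal
```

Because the code is a pure function of integer coordinates, it sorts sparse voxels without touching a dense grid, which is what makes it attractive for sparse-voxel pipelines.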
arXiv:2412.02334 [pdf, other] quant-ph, cs.AI
Title: Reinforcement learning to learn quantum states for Heisenberg scaling accuracy
Authors: Jeongwoo Jae, Jeonghoon Hong, Jinho Choo, Yeong-Dae Kwon
Abstract: Learning quantum states is a crucial task for realizing the potential of quantum information technology. Recently, neural approaches have emerged as promising methods for learning quantum states. We propose a meta-learning model that employs reinforcement learning (RL) to optimize the process of learning quantum states. Our scheme trains a hardware-efficient ansatz with a black-box optimization algorithm called evolution strategy (ES). To enhance the efficiency of ES, an RL agent dynamically adjusts the hyperparameters of ES. To facilitate the RL training, we introduce an action repetition strategy inspired by curriculum learning. The RL agent significantly improves the sample efficiency of learning random quantum states and achieves infidelity scaling close to the Heisenberg limit. We showcase that an RL agent trained on 3-qubit states generalizes to learning states of up to 5 qubits. These results highlight the utility of RL-driven meta-learning for enhancing the efficiency and generalizability of learning quantum states. Our approach can be applied to improve quantum control, quantum optimization, and quantum machine learning.
Submitted 3 December, 2024; originally announced December 2024.
Comments: 14 pages, 6 figures
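A bare-bones evolution strategy loop, of the kind the paper's RL agent tunes, is sketched below on a toy quadratic objective standing in for state infidelity; the hyperparameters an RL policy would adjust are marked:

```python
# Bare-bones evolution strategy (ES). The quadratic fitness is a stand-in for
# the ansatz-infidelity objective; all constants are assumed for illustration.
import numpy as np

rng = np.random.default_rng(3)
DIM, POP, SIGMA, LR = 12, 32, 0.1, 0.05
target = rng.normal(size=DIM)             # plays the role of the unknown state
theta = np.zeros(DIM)                     # ansatz parameters

def fitness(p):                           # higher is better (negative "infidelity")
    return -np.sum((p - target) ** 2)

for step in range(300):
    noise = rng.normal(size=(POP, DIM))
    scores = np.array([fitness(theta + SIGMA * n) for n in noise])
    scores = (scores - scores.mean()) / (scores.std() + 1e-8)
    theta += LR / (POP * SIGMA) * noise.T @ scores  # vanilla ES gradient estimate
    # <- an RL policy would pick new SIGMA / LR here from the score statistics

print(fitness(theta))   # fitness improves toward 0 as theta approaches target
```

The comment marks the hook where a meta-learner observes optimization progress and resets the step size and noise scale, which is the role the abstract assigns to the RL agent.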
arXiv:2412.02122 [pdf, other] cs.IR
Title: Improving Sequential Recommender Systems with Online and In-store User Behavior
Authors: Luyi Ma, Aashika Padmanabhan, Anjana Ganesh, Shengwei Tang, Jiao Chen, Xiaohan Li, Lalitesh Morishetti, Kaushiki Nag, Malay Patel, Jason Cho, Sushant Kumar, Kannan Achan
Abstract: Online e-commerce platforms have been extending into in-store shopping, allowing users to keep the canonical online browsing and checkout experience while exploring in-store shopping. However, the growing transition between online and in-store poses a challenge to sequential recommender systems for future online interaction prediction, due to the lack of holistic modeling of hybrid (online and in-store) user behaviors. The challenges are twofold. First, combining online and in-store user behavior data into a single data schema and supporting multiple stages in the model life cycle (pre-training, training, inference, etc.) requires a new data pipeline design. Second, online recommender systems, which solely rely on online user behavior sequences, must be redesigned to support online and in-store user data as input under the sequential modeling setting. To overcome the first challenge, we propose a hybrid, omnichannel data pipeline that compiles online and in-store user behavior data by caching information from diverse data sources. To address the second, we introduce a model-agnostic encoder module in the sequential recommender system that interprets the user's in-store transactions and augments the modeling capacity for better online interaction prediction given the hybrid user behavior.
Submitted 2 December, 2024; originally announced December 2024.
Comments: 6 pages, IEEE BigData 2024 Workshop

arXiv:2412.00621 [pdf, other] cs.CR, cs.AI, cs.CY
Title: Exposing LLM Vulnerabilities: Adversarial Scam Detection and Performance
Authors: Chen-Wei Chang, Shailik Sarkar, Shutonu Mitra, Qi Zhang, Hossein Salemi, Hemant Purohit, Fengxiu Zhang, Michin Hong, Jin-Hee Cho, Chang-Tien Lu
Abstract: Can we trust Large Language Models (LLMs) to accurately predict scams? This paper investigates the vulnerabilities of LLMs when facing adversarial scam messages for the task of scam detection. We address this issue by creating a comprehensive dataset with fine-grained labels of scam messages, including both original and adversarial scam messages. The dataset extends the traditional binary classes for the scam detection task into more nuanced scam types. Our analysis shows how adversarial examples exploit the vulnerabilities of an LLM, leading to a high misclassification rate. We evaluate the performance of LLMs on these adversarial scam messages and propose strategies to improve their robustness.
Submitted 30 November, 2024; originally announced December 2024.
Comments: 4 pages, 2024 IEEE International Conference on Big Data workshop BigEACPS 2024
arXiv:2411.19527 [pdf, other] cs.CV, cs.AI, cs.LG
Title: DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding
Authors: Jungbin Cho, Junwan Kim, Jisoo Kim, Minseo Kim, Mingu Kang, Sungeun Hong, Tae-Hyun Oh, Youngjae Yu
Abstract: Human motion, inherently continuous and dynamic, presents significant challenges for generative models. Despite their dominance, discrete quantization methods such as VQ-VAEs suffer from inherent limitations, including restricted expressiveness and frame-wise noise artifacts. Continuous approaches, while producing smoother and more natural motions, often falter due to high-dimensional complexity and limited training data. To resolve this "discord" between discrete and continuous representations, we introduce DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a novel method that decodes discrete motion tokens into continuous motion through rectified flow. By employing an iterative refinement process in the continuous space, DisCoRD captures fine-grained dynamics and ensures smoother and more natural motions. Compatible with any discrete-based framework, our method enhances naturalness without compromising faithfulness to the conditioning signals. Extensive evaluations demonstrate that DisCoRD achieves state-of-the-art performance, with an FID of 0.032 on HumanML3D and 0.169 on KIT-ML. These results solidify DisCoRD as a robust solution for bridging the divide between discrete efficiency and continuous realism. Our project page is available at: https://whwjdqls.github.io/discord.github.io/.
Submitted 1 December, 2024; v1 submitted 29 November, 2024; originally announced November 2024.
Comments: 20 pages, 18 figures
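Decoding with a rectified flow amounts to integrating a learned velocity field from noise toward the data. The sketch below uses a stub velocity function in place of the trained network; everything else (Euler steps, conditioning on a token embedding) mirrors the generic recipe, not DisCoRD's exact decoder:

```python
# Sketch of rectified-flow decoding: start from noise and integrate a velocity
# field conditioned on discrete motion tokens to obtain a continuous frame.
# The "network" is a stub that moves straight toward a token-dependent target,
# the straight-path behavior rectified flow is trained to approximate.
import numpy as np

rng = np.random.default_rng(4)
D_MOTION, N_STEPS = 6, 16
token_embedding = rng.normal(size=D_MOTION)       # from the discrete tokenizer

def velocity(x, t, cond):
    return cond - x        # stub: replace with the trained flow network

x = rng.normal(size=D_MOTION)                     # x_0 ~ N(0, I)
for i in range(N_STEPS):                          # simple Euler solver
    t = i / N_STEPS
    x = x + velocity(x, t, token_embedding) / N_STEPS
print(np.linalg.norm(x - token_embedding))        # x_1 lands near the target
```

The iterative solver is where the "iterative refinement in continuous space" of the abstract happens: each Euler step nudges the sample closer to a motion consistent with the tokens.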
arXiv:2411.18664 [pdf, other] cs.CV
Title: Spatiotemporal Skip Guidance for Enhanced Video Diffusion Sampling
Authors: Junha Hyung, Kinam Kim, Susung Hong, Min-Jung Kim, Jaegul Choo
Abstract: Diffusion models have emerged as a powerful tool for generating high-quality images, videos, and 3D content. While sampling guidance techniques like CFG improve quality, they reduce diversity and motion. Autoguidance mitigates these issues but demands extra weak-model training, limiting its practicality for large-scale models. In this work, we introduce Spatiotemporal Skip Guidance (STG), a simple training-free sampling guidance method for enhancing transformer-based video diffusion models. STG employs an implicit weak model via self-perturbation, avoiding the need for external models or additional training. By selectively skipping spatiotemporal layers, STG produces an aligned, degraded version of the original model to boost sample quality without compromising diversity or dynamic degree. Our contributions include: (1) introducing STG as an efficient, high-performing guidance technique for video diffusion models, (2) eliminating the need for auxiliary models by simulating a weak model through layer skipping, and (3) ensuring quality-enhanced guidance without compromising sample diversity or dynamics, unlike CFG. For additional results, visit https://junhahyung.github.io/STGuidance.
Submitted 27 November, 2024; originally announced November 2024.
Comments: project page: https://junhahyung.github.io/STGuidance
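The guidance rule itself is compact: extrapolate from a self-perturbed "weak" prediction toward the full model's prediction. A toy sketch, with dense layers standing in for the transformer blocks of a video diffusion model and an assumed guidance scale:

```python
# Sketch of skip-layer guidance: run the model twice, once with some layers
# skipped (the implicit weak model), then extrapolate away from the weak
# prediction, analogous to CFG but without a separate model. Toy layers only.
import numpy as np

rng = np.random.default_rng(5)
layers = [lambda x, W=rng.normal(scale=0.3, size=(8, 8)): np.tanh(x @ W)
          for _ in range(6)]

def denoise(x, skip=()):
    for i, layer in enumerate(layers):
        if i not in skip:
            x = layer(x)
    return x

x = rng.normal(size=8)
full = denoise(x)
weak = denoise(x, skip=(2, 3))         # self-perturbation: drop mid layers
scale = 1.5                            # guidance strength (assumed value)
guided = weak + scale * (full - weak)  # push samples toward the stronger model
print(guided)
```

Because the weak model is the same network with layers dropped, its predictions stay aligned with the full model, which is what lets the extrapolation sharpen quality without an auxiliary network.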
arXiv:2411.15783 [pdf, other] cs.HC
Title: Limitations of Online Play Content for Parents of Infants and Toddlers
Authors: Keunwoo Park, Subin Ahn, Mina Jung, You Jung Cho, Seulah Jeong, Cheong-Ah Huh
Abstract: Play is a fundamental aspect of developmental growth, yet many parents encounter significant challenges in fulfilling their caregiving roles in this area. As online content increasingly serves as the primary source of parental guidance, this study investigates the difficulties parents face related to play and evaluates the limitations of current online content. We identified ten findings through in-depth interviews with nine parents who reported struggles in engaging with their children during play. Based on these findings, we discuss the major limitations of online play content and suggest how they can be improved. These recommendations include minimizing parental anxiety, accommodating diverse play scenarios, providing credible and personalized information, encouraging creativity, and delivering the same content in multiple formats.
Submitted 4 January, 2025; v1 submitted 24 November, 2024; originally announced November 2024.
Comments: Accepted to HCI Korea 2025

arXiv:2411.15115 [pdf, other] cs.CV, cs.AI, cs.CL
Title: VideoRepair: Improving Text-to-Video Generation via Misalignment Evaluation and Localized Refinement
Authors: Daeun Lee, Jaehong Yoon, Jaemin Cho, Mohit Bansal
Abstract: Recent text-to-video (T2V) diffusion models have demonstrated impressive generation capabilities across various domains. However, these models often generate videos that are misaligned with their text prompts, especially when the prompts describe complex scenes with multiple objects and attributes. To address this, we introduce VideoRepair, a novel model-agnostic, training-free video refinement framework that automatically identifies fine-grained text-video misalignments and generates explicit spatial and textual feedback, enabling a T2V diffusion model to perform targeted, localized refinements. VideoRepair consists of four stages: in (1) video evaluation, we detect misalignments by generating fine-grained evaluation questions and answering those questions with an MLLM. In (2) refinement planning, we identify accurately generated objects and then create localized prompts to refine other areas in the video. Next, in (3) region decomposition, we segment the correctly generated area using a combined grounding module. We regenerate the video by adjusting the misaligned regions while preserving the correct regions in (4) localized refinement. On two popular video generation benchmarks (EvalCrafter and T2V-CompBench), VideoRepair substantially outperforms recent baselines across various text-video alignment metrics. We provide a comprehensive analysis of VideoRepair's components and qualitative examples.
Submitted 22 November, 2024; originally announced November 2024.
Comments: Project page: https://video-repair.github.io
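Since the framework is model-agnostic, its four stages reduce to a control flow over pluggable components. The sketch below captures only that flow; every function is a hypothetical stub standing in for an MLLM, a grounding module, or the T2V model:

```python
# High-level orchestration sketch of a four-stage evaluate/plan/decompose/
# refine loop. All functions are invented stubs; only the control flow
# mirrors the abstract's description.
def evaluate(video, prompt):
    """Stage 1: ask fine-grained questions, return misaligned phrases."""
    return ["the dog is missing"]                       # stub

def plan(prompt, misalignments):
    """Stage 2: keep well-generated objects, write localized prompts."""
    return {"keep": ["red car"], "fix": misalignments}  # stub

def decompose(video, plan_):
    """Stage 3: segment the correctly generated regions to preserve."""
    return {"mask": "car_region"}                       # stub

def refine(video, plan_, regions, t2v_model):
    """Stage 4: regenerate only the misaligned regions."""
    return t2v_model(plan_["fix"], preserve=regions)    # stub

def video_repair(video, prompt, t2v_model):
    mis = evaluate(video, prompt)
    if not mis:
        return video                                    # already aligned
    plan_ = plan(prompt, mis)
    regions = decompose(video, plan_)
    return refine(video, plan_, regions, t2v_model)

print(video_repair("vid0", "a dog next to a red car",
                   lambda fix, preserve: f"repaired({fix}, keep={preserve})"))
```

Keeping each stage behind a plain function boundary is what makes the framework training-free and swappable across T2V backbones.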
In this paper, we introduce WORLDREP (WORLD Relationship and Event Prediction), a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14042v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14042v1-abstract-full" style="display: none;"> Predicting future international events from textual information, such as news articles, has tremendous potential for applications in global policy, strategic decision-making, and geopolitics. However, existing datasets available for this task are often limited in quality, hindering the progress of related research. In this paper, we introduce WORLDREP (WORLD Relationship and Event Prediction), a novel dataset designed to address these limitations by leveraging the advanced reasoning capabilities of large-language models (LLMs). Our dataset features high-quality scoring labels generated through advanced prompt modeling and rigorously validated by domain experts in political science. We showcase the quality and utility of WORLDREP for real-world event prediction tasks, demonstrating its effectiveness through extensive experiments and analysis. Furthermore, we publicly release our dataset along with the full automation source code for data collection, labeling, and benchmarking, aiming to support and advance research in text-based event prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14042v1-abstract-full').style.display = 'none'; document.getElementById('2411.14042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
arXiv:2411.13779 [pdf, other] cs.CL cs.AI cs.LG
NewsInterview: a Dataset and a Playground to Evaluate LLMs' Ground Gap via Informational Interviews
Authors: Michael Lu, Hyundong Justin Cho, Weiyan Shi, Jonathan May, Alexander Spangher
Abstract: Large Language Models (LLMs) have demonstrated impressive capabilities in generating coherent text but often struggle with grounding language and strategic dialogue. To address this gap, we focus on journalistic interviews, a domain rich in grounding communication and abundant in data. We curate a dataset of 40,000 two-person informational interviews from NPR and CNN, and reveal that LLMs are significantly less likely than human interviewers to use acknowledgements and to pivot to higher-level questions. Realizing that a fundamental deficit exists in multi-turn planning and strategic thinking, we develop a realistic simulated environment, incorporating source personas and persuasive elements, in order to facilitate the development of agents with longer-horizon rewards. Our experiments show that while source LLMs mimic human behavior in information sharing, interviewer LLMs struggle with recognizing when questions are answered and engaging persuasively, leading to suboptimal information extraction across model size and capability. These findings underscore the need for enhancing LLMs' strategic dialogue capabilities.
Submitted 20 November, 2024; originally announced November 2024.

arXiv:2411.06736 [pdf, other] cs.LG
MrSteve: Instruction-Following Agents in Minecraft with What-Where-When Memory
Authors: Junyeong Park, Junmo Cho, Sungjin Ahn
Abstract: Significant advances have been made in developing general-purpose embodied AI in environments like Minecraft through the adoption of LLM-augmented hierarchical approaches. While these approaches, which combine high-level planners with low-level controllers, show promise, low-level controllers frequently become performance bottlenecks due to repeated failures. In this paper, we argue that the primary cause of failure in many low-level controllers is the absence of an episodic memory system. To address this, we introduce MrSteve (Memory Recall Steve-1), a novel low-level controller equipped with Place Event Memory (PEM), a form of episodic memory that captures what, where, and when information from episodes. This directly addresses the main limitation of the popular low-level controller, Steve-1. Unlike previous models that rely on short-term memory, PEM organizes spatial and event-based data, enabling efficient recall and navigation in long-horizon tasks. Additionally, we propose an Exploration Strategy and a Memory-Augmented Task Solving Framework, allowing agents to alternate between exploration and task-solving based on recalled events. Our approach significantly improves task-solving and exploration efficiency compared to existing methods. We will release our code and demos on the project page: https://sites.google.com/view/mr-steve.
Submitted 25 December, 2024; v1 submitted 11 November, 2024; originally announced November 2024.

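A minimal sketch of a what-where-when store in the spirit of Place Event Memory follows; the Event schema and the recency-based recall rule are illustrative assumptions, not the paper's implementation.

```python
# Hypothetical what-where-when episodic memory for an embodied agent.
from dataclasses import dataclass

@dataclass
class Event:
    what: str     # e.g. "found_iron_ore"
    where: tuple  # (x, y, z) position in the world
    when: int     # environment timestep

class PlaceEventMemory:
    def __init__(self):
        self.events = []

    def store(self, what, where, when):
        self.events.append(Event(what, where, when))

    def recall(self, what):
        """Return the most recent place an event of this type was seen."""
        hits = [e for e in self.events if e.what == what]
        return max(hits, key=lambda e: e.when) if hits else None

mem = PlaceEventMemory()
mem.store("found_iron_ore", (120, 64, -35), when=310)
mem.store("found_iron_ore", (98, 60, -12), when=870)
print(mem.recall("found_iron_ore"))  # latest sighting -> navigate there
```
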
arXiv:2411.05001 [pdf, other] cs.CV cs.AI cs.CL cs.LG
Analyzing The Language of Visual Tokens
Authors: David M. Chan, Rodolfo Corona, Joonyong Park, Cheol Jun Cho, Yutong Bai, Trevor Darrell
Abstract: With the introduction of transformer-based models for vision and language tasks, such as LLaVA and Chameleon, there has been renewed interest in the discrete tokenized representation of images. These models often treat image patches as discrete tokens, analogous to words in natural language, learning joint alignments between visual and human languages. However, little is known about the statistical behavior of these visual languages: whether they follow similar frequency distributions, grammatical structures, or topologies as natural languages. In this paper, we take a natural-language-centric approach to analyzing discrete visual languages and uncover striking similarities and fundamental differences. We demonstrate that, although visual languages adhere to Zipfian distributions, higher token innovation drives greater entropy and lower compression, with tokens predominantly representing object parts, indicating intermediate granularity. We also show that visual languages lack cohesive grammatical structures, leading to higher perplexity and weaker hierarchical organization compared to natural languages. Finally, we demonstrate that, while vision models align more closely with natural languages than other models, this alignment remains significantly weaker than the cohesion found within natural languages. Through these experiments, we demonstrate how understanding the statistical properties of discrete visual languages can inform the design of more effective computer vision models.
Submitted 7 November, 2024; originally announced November 2024.

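The unigram statistics discussed here (Zipf fit, entropy) are easy to reproduce on any token stream; the sketch below runs on a random Zipf-like toy corpus, not on the paper's visual-token data.

```python
# Toy Zipf and entropy check on a synthetic token stream.
import math, random
from collections import Counter

random.seed(0)
vocab = [f"tok{r}" for r in range(1, 51)]
weights = [1 / r for r in range(1, 51)]      # Zipf-like toy source
tokens = random.choices(vocab, weights=weights, k=5000)

counts = sorted(Counter(tokens).values(), reverse=True)
total = sum(counts)

# Zipf check: log frequency should fall roughly linearly with log rank.
for rank in (1, 2, 5, 10, 25):
    print(rank, round(math.log(rank), 2), round(math.log(counts[rank - 1] / total), 2))

# Unigram entropy (bits/token): higher entropy -> lower compressibility.
entropy = -sum((c / total) * math.log2(c / total) for c in counts)
print("entropy:", round(entropy, 3), "bits/token")
```
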
arXiv:2411.04952 [pdf, other] cs.CV cs.AI cs.CL
M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page Multi-document Understanding
Authors: Jaemin Cho, Debanjan Mahata, Ozan Irsoy, Yujie He, Mohit Bansal
Abstract: Document visual question answering (DocVQA) pipelines that answer questions from documents have broad applications. Existing methods focus on handling single-page documents with multi-modal language models (MLMs), or rely on text-based retrieval-augmented generation (RAG) that uses text extraction tools such as optical character recognition (OCR). However, there are difficulties in applying these methods in real-world scenarios: (a) questions often require information across different pages or documents, where MLMs cannot handle many long documents; (b) documents often have important information in visual elements such as figures, but text extraction tools ignore them. We introduce M3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various document contexts (closed-domain and open-domain), question hops (single-hop and multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG finds relevant documents and answers questions using a multi-modal retriever and an MLM, so that it can efficiently handle single or many documents while preserving visual information. Since previous DocVQA datasets ask questions in the context of a specific document, we also present M3DocVQA, a new benchmark for evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages. In three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results show that M3DocRAG with ColPali and Qwen2-VL 7B outperforms many strong baselines, including achieving state-of-the-art performance in MP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and retrieval models. Lastly, we qualitatively show that M3DocRAG can successfully handle various scenarios, such as when relevant information exists across multiple pages and when answer evidence only exists in images.
Submitted 7 November, 2024; originally announced November 2024.
Comments: Project webpage: https://m3docrag.github.io

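In outline, the retrieve-then-read loop might look like the following sketch; embed_page, embed_query, and answer_with_mlm are hypothetical stand-ins for a visual retriever (such as ColPali) and an MLM (such as Qwen2-VL), and only the abstract above grounds the structure.

```python
# Hypothetical multi-modal RAG loop: embed page images, retrieve top-k,
# answer with an MLM. All model calls below are random stand-ins.
import numpy as np

def embed_page(page):   # stand-in for a visual page retriever
    return np.random.rand(8)

def embed_query(q):
    return np.random.rand(8)

def answer_with_mlm(q, pages):  # stand-in for a multi-modal LM
    return f"answer({q}) from {pages}"

corpus = {f"doc1_p{i}": embed_page(i) for i in range(5)}
q = "What does Figure 2 report?"
qv = embed_query(q)
scores = {p: float(v @ qv) for p, v in corpus.items()}
topk = sorted(scores, key=scores.get, reverse=True)[:2]  # retrieve 2 pages
print(answer_with_mlm(q, topk))
```
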
arXiv:2411.00686 [pdf, other] cs.CL cs.AI
Latent Paraphrasing: Perturbation on Layers Improves Knowledge Injection in Language Models
Authors: Minki Kang, Sung Ju Hwang, Gibbeum Lee, Jaewoong Cho
Abstract: As Large Language Models (LLMs) are increasingly deployed in specialized domains with continuously evolving knowledge, the need for timely and precise knowledge injection has become essential. Fine-tuning with paraphrased data is a common approach to enhance knowledge injection, yet it faces two significant challenges: high computational costs due to repetitive external model usage and limited sample diversity. To this end, we introduce LaPael, a latent-level paraphrasing method that applies input-dependent noise to early LLM layers. This approach enables diverse and semantically consistent augmentations directly within the model. Furthermore, it eliminates the recurring costs of paraphrase generation for each knowledge update. Our extensive experiments on question-answering benchmarks demonstrate that LaPael improves knowledge injection over standard fine-tuning and existing noise-based approaches. Additionally, combining LaPael with data-level paraphrasing further enhances performance.
Submitted 1 November, 2024; originally announced November 2024.
Comments: NeurIPS 2024

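The core idea, perturbing an early layer with input-dependent noise, can be sketched with a PyTorch forward hook; the noise model below (Gaussian scaled by the activation norm) is an assumption for illustration, not LaPael's learned perturbation.

```python
# Minimal sketch of latent-level perturbation on an early layer.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16))

def latent_paraphrase_hook(module, inputs, output):
    # Input-dependent noise: scale per-example by the activation norm.
    scale = 0.1 * output.norm(dim=-1, keepdim=True)
    return output + scale * torch.randn_like(output)

# Perturb only the first ("early") layer during fine-tuning forward passes.
handle = model[0].register_forward_hook(latent_paraphrase_hook)
x = torch.randn(4, 16)
print(model(x).shape)  # same shape; activations are stochastically varied
handle.remove()
```
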
arXiv:2410.22891 [pdf, other] cs.LG cs.AI cs.CL
VPO: Leveraging the Number of Votes in Preference Optimization
Authors: Jae Hyeon Cho, Minkyung Park, Byung-Jun Lee
Abstract: Direct Preference Optimization (DPO) trains a language model using human preference data, bypassing the explicit reward modeling phase of Reinforcement Learning from Human Feedback (RLHF). By iterating over sentence pairs in a preference dataset, DPO enhances generation quality by increasing the likelihood of producing preferred sentences over less favored ones. Preference datasets are typically created by selecting preferred sentences through a voting process involving multiple individuals, as opinions can vary due to the subjective nature of human preferences. While the number of votes offers insight into whether a sentence pair is clearly preferable or controversial, current methods do not fully leverage this information. In this paper, we introduce a technique that leverages user voting data to better align with diverse subjective preferences. We employ the Bayesian Minimum Mean Square Error (Bayesian MMSE) estimator to model the probability that one generation is preferable to another. Using this estimated probability as a target, we develop the Vote-based Preference Optimization (VPO) framework, which incorporates the number of votes on both sides to distinguish between controversial and obvious generation pairs. We show that previous algorithms, such as DPO and Identity Preference Optimization (IPO), can be extended using the proposed framework, termed VDPO and VIPO. Our experiments demonstrate that these proposed algorithms outperform various existing methods, including their base algorithms.
Submitted 30 October, 2024; originally announced October 2024.

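One natural instantiation of the vote-based target: under a uniform Beta prior, the Bayesian MMSE (posterior-mean) estimate of the preference probability is (votes_w + 1) / (votes_w + votes_l + 2), which can then serve as a soft target in a DPO-style loss. A sketch under those assumptions, with the exact VPO objective left to the paper:

```python
# Soft preference target from vote counts, used in a DPO-style loss.
import math

def preference_probability(votes_w, votes_l, a=1.0, b=1.0):
    """Posterior mean of a Bernoulli parameter under a Beta(a, b) prior."""
    return (votes_w + a) / (votes_w + votes_l + a + b)

def soft_dpo_loss(margin, p_target):
    """Cross-entropy between the soft vote target and sigmoid(margin)."""
    p_model = 1.0 / (1.0 + math.exp(-margin))
    return -(p_target * math.log(p_model) + (1 - p_target) * math.log(1 - p_model))

print(preference_probability(9, 1))  # clear preference   -> ~0.83
print(preference_probability(5, 5))  # controversial pair -> 0.50
print(soft_dpo_loss(margin=2.0, p_target=preference_probability(9, 1)))
```
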
arXiv:2410.22376 [pdf, other] cs.LG cs.AI cs.CL cs.CV
Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion Models on Rare Concepts with LLM Guidance
Authors: Dongmin Park, Sebin Kim, Taehong Moon, Minkyu Kim, Kangwook Lee, Jaewoong Cho
Abstract: State-of-the-art text-to-image (T2I) diffusion models often struggle to generate rare compositions of concepts, e.g., objects with unusual attributes. In this paper, we show that the compositional generation power of diffusion models on such rare concepts can be significantly enhanced by Large Language Model (LLM) guidance. We start with empirical and theoretical analysis, demonstrating that exposing frequent concepts relevant to the target rare concepts during the diffusion sampling process yields more accurate concept composition. Based on this, we propose a training-free approach, R2F, that plans and executes the overall rare-to-frequent concept guidance throughout the diffusion inference by leveraging the abundant semantic knowledge in LLMs. Our framework is flexible across any pre-trained diffusion models and LLMs, and can be seamlessly integrated with region-guided diffusion approaches. In extensive experiments on three datasets, including our newly proposed benchmark RareBench, which contains various prompts with rare compositions of concepts, R2F significantly surpasses existing models, including SD3.0 and FLUX, by up to 28.1%p in T2I alignment. Code is available at https://github.com/krafton-ai/Rare-to-Frequent.
Submitted 6 January, 2025; v1 submitted 29 October, 2024; originally announced October 2024.

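Schematically, rare-to-frequent guidance amounts to conditioning early denoising steps on an LLM-suggested frequent surrogate and later steps on the rare target; the switch point and prompts below are illustrative assumptions, not the paper's planned schedule.

```python
# Schematic rare-to-frequent prompt schedule over diffusion steps.
def r2f_prompt_schedule(rare, frequent, num_steps, switch_frac=0.4):
    prompts = []
    for t in range(num_steps):
        prompts.append(frequent if t < switch_frac * num_steps else rare)
    return prompts

schedule = r2f_prompt_schedule(rare="a furry frog", frequent="a frog", num_steps=10)
for t, p in enumerate(schedule):
    print(f"step {t}: condition on '{p}'")  # fed to the diffusion sampler
```
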
arXiv:2410.21826 [pdf, other] cs.CV
Volumetric Conditioning Module to Control Pretrained Diffusion Models for 3D Medical Images
Authors: Suhyun Ahn, Wonjung Park, Jihoon Cho, Seunghyuck Park, Jinah Park
Abstract: Spatial control methods using additional modules on pretrained diffusion models have gained attention for enabling conditional generation in natural images. These methods guide the generation process with new conditions while leveraging the capabilities of large models. They could be beneficial as training strategies in the context of 3D medical imaging, where training a diffusion model from scratch is challenging due to high computational costs and data scarcity. However, the potential application of spatial control methods with additional modules to 3D medical images has not yet been explored. In this paper, we present a tailored spatial control method for 3D medical images with a novel lightweight module, the Volumetric Conditioning Module (VCM). Our VCM employs an asymmetric U-Net architecture to effectively encode complex information from various levels of 3D conditions, providing detailed guidance in image synthesis. To examine the applicability of spatial control methods and the effectiveness of VCM for 3D medical data, we conduct experiments under single- and multi-modal condition scenarios across a wide range of dataset sizes, from extremely small datasets with 10 samples to large datasets with 500 samples. The experimental results show that the VCM is effective for conditional generation and efficient in terms of requiring less training data and computational resources. We further investigate the potential applications of our spatial control method through axial super-resolution for medical images. Our code is available at https://github.com/Ahn-Ssu/VCM.
Submitted 29 October, 2024; originally announced October 2024.
Comments: 17 pages, 18 figures, accepted @ WACV 2025

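A toy sketch of the general pattern, a small trainable 3D encoder whose output is added to frozen backbone features, follows; the layer shapes and the additive injection are assumptions, not the actual VCM architecture.

```python
# Toy volumetric conditioning path added to a frozen diffusion backbone.
import torch
import torch.nn as nn

class TinyVCM(nn.Module):
    def __init__(self, ch=8):
        super().__init__()
        self.encode = nn.Sequential(
            nn.Conv3d(1, ch, 3, padding=1), nn.SiLU(),
            nn.Conv3d(ch, ch, 3, padding=1))
        self.to_backbone = nn.Conv3d(ch, 4, 1)  # match backbone channels

    def forward(self, condition):
        return self.to_backbone(self.encode(condition))

vcm = TinyVCM()
condition = torch.randn(1, 1, 16, 16, 16)      # 3D condition volume (e.g. mask)
backbone_feat = torch.randn(1, 4, 16, 16, 16)  # frozen diffusion features
guided = backbone_feat + vcm(condition)        # inject spatial guidance
print(guided.shape)
```
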
arXiv:2410.20811 [pdf, other] cs.LG cs.AI cs.CL
Bridging the Gap between Expert and Language Models: Concept-guided Chess Commentary Generation and Evaluation
Authors: Jaechang Kim, Jinmin Goh, Inseok Hwang, Jaewoong Cho, Jungseul Ok
Abstract: Deep learning-based expert models have reached superhuman performance in decision-making domains such as chess and Go. However, explaining or commenting on given decisions remains under-explored, although it is important for model explainability and human education. The outputs of expert models are accurate, yet difficult for humans to interpret. On the other hand, large language models (LLMs) can produce fluent commentary but are prone to hallucinations due to their limited decision-making capabilities. To bridge this gap between expert models and LLMs, we focus on chess commentary as a representative task of explaining complex decision-making processes through language and address both the generation and evaluation of commentary. We introduce Concept-guided Chess Commentary generation (CCC) for producing commentary and GPT-based Chess Commentary Evaluation (GCC-Eval) for assessing it. CCC integrates the decision-making strengths of expert models with the linguistic fluency of LLMs through prioritized, concept-based explanations. GCC-Eval leverages expert knowledge to evaluate chess commentary based on informativeness and linguistic quality. Experimental results, validated by both human judges and GCC-Eval, demonstrate that CCC generates commentary that is accurate, informative, and fluent.
Submitted 8 February, 2025; v1 submitted 28 October, 2024; originally announced October 2024.
Comments: Appears in NAACL 2025

arXiv:2410.20199 [pdf, other] cs.AI
Rethinking the Uncertainty: A Critical Review and Analysis in the Era of Large Language Models
Authors: Mohammad Beigi, Sijia Wang, Ying Shen, Zihao Lin, Adithya Kulkarni, Jianfeng He, Feng Chen, Ming Jin, Jin-Hee Cho, Dawei Zhou, Chang-Tien Lu, Lifu Huang
Abstract: In recent years, Large Language Models (LLMs) have become fundamental to a broad spectrum of artificial intelligence applications. As the use of LLMs expands, precisely estimating the uncertainty in their predictions has become crucial. Current methods often struggle to accurately identify, measure, and address the true uncertainty, with many focusing primarily on estimating model confidence. This discrepancy is largely due to an incomplete understanding of where, when, and how uncertainties are injected into models. This paper introduces a comprehensive framework specifically designed to identify and understand the types and sources of uncertainty, aligned with the unique characteristics of LLMs. Our framework enhances the understanding of the diverse landscape of uncertainties by systematically categorizing and defining each type, establishing a solid foundation for developing targeted methods that can precisely quantify these uncertainties. We also provide a detailed introduction to key related concepts and examine the limitations of current methods in mission-critical and safety-sensitive applications. The paper concludes with a perspective on future directions aimed at enhancing the reliability and practical adoption of these methods in real-world scenarios.
Submitted 26 October, 2024; originally announced October 2024.

arXiv:2410.18652 [pdf, other] cs.LG cs.AI cs.CL
C^2: Scalable Auto-Feedback for LLM-based Chart Generation
Authors: Woosung Koh, Jang Han Yoon, MinHyung Lee, Youngjin Song, Jaegwan Cho, Jaehyun Kang, Taehyeon Kim, Se-Young Yun, Youngjae Yu, Bongshin Lee
Abstract: Generating high-quality charts with Large Language Models (LLMs) presents significant challenges due to limited data and the high cost of scaling through human curation. ⟨instruction, data, code⟩ triplets are scarce and expensive to manually curate, as their creation demands technical expertise. To address this scalability challenge, we introduce a reference-free automatic feedback generator, which eliminates the need for costly human intervention. Our novel framework, C^2, consists of (1) an automatic feedback provider (ChartAF) and (2) a diverse, reference-free dataset (ChartUIE-8K). The results are compelling: in our first experiment, 74% of respondents strongly preferred, and 10% preferred, the results after feedback. The second post-feedback experiment demonstrates that ChartAF outperforms nine baselines. Moreover, ChartUIE-8K significantly improves data diversity by increasing queries, datasets, and chart types by 5982%, 1936%, and 91%, respectively, over benchmarks. Finally, a study of LLM users revealed that 94% of participants preferred ChartUIE-8K's queries, with 93% deeming them aligned with real-world use cases. Core contributions are available as open-source at chartsquared.github.io, with ample qualitative examples.
Submitted 12 February, 2025; v1 submitted 24 October, 2024; originally announced October 2024.
Comments: NAACL 2025 Main (Long)

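The generate-critique-revise pattern behind an automatic feedback provider can be sketched as below; chart_llm and feedback_llm are hypothetical stand-ins for the model calls, not the C^2 implementation.

```python
# Schematic reference-free feedback loop for chart generation.
def chart_llm(query, feedback=None):
    return f"chart_code({query}, feedback={feedback})"

def feedback_llm(query, chart_code):
    # Reference-free critique of the generated chart against the query.
    return f"critique of {chart_code}"

query = "bar chart of monthly revenue"
code = chart_llm(query)
for _ in range(2):                      # a couple of refinement rounds
    fb = feedback_llm(query, code)
    code = chart_llm(query, feedback=fb)
print(code)
```
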
arXiv:2410.18444 [pdf, other] cs.CL
Evaluating and Improving Automatic Speech Recognition Systems for Korean Meteorological Experts
Authors: ChaeHun Park, Hojun Cho, Jaegul Choo
Abstract: This paper explores integrating Automatic Speech Recognition (ASR) into natural language query systems to improve weather forecasting efficiency for Korean meteorologists. We address challenges in developing ASR systems for the Korean weather domain, specifically specialized vocabulary and Korean linguistic intricacies. To tackle these issues, we constructed an evaluation dataset of spoken queries recorded by native Korean speakers. Using this dataset, we assessed various configurations of a multilingual ASR model family, identifying performance limitations related to domain-specific terminology. We then implemented a simple text-to-speech-based data augmentation method, which improved the recognition of specialized terms while maintaining general-domain performance. Our contributions include creating a domain-specific dataset, comprehensive ASR model evaluations, and an effective augmentation technique. We believe our work provides a foundation for future advancements in ASR for the Korean weather forecasting domain.
Submitted 24 October, 2024; originally announced October 2024.

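The augmentation recipe can be sketched in a few lines; synthesize and fine_tune_asr are placeholders for a real TTS system and ASR trainer, and the domain terms are stand-ins rather than the paper's vocabulary list.

```python
# Sketch of TTS-based augmentation: synthesize audio for specialist
# vocabulary and mix it into the ASR fine-tuning data.
domain_terms = ["precipitable water", "isentropic analysis"]  # stand-ins

def synthesize(text):
    return f"wav({text})"               # placeholder for a TTS system

def fine_tune_asr(pairs):
    print(f"fine-tuning on {len(pairs)} (audio, text) pairs")

augmented = [(synthesize(t), t) for t in domain_terms]
general = [("wav(...)", "...")]         # existing general-domain data
fine_tune_asr(general + augmented)      # keep general data to avoid drift
```
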
arXiv:2410.13598 [pdf, other] cs.CV
Let Me Finish My Sentence: Video Temporal Grounding with Holistic Text Understanding
Authors: Jongbhin Woo, Hyeonggon Ryu, Youngjoon Jang, Jae Won Cho, Joon Son Chung
Abstract: Video Temporal Grounding (VTG) aims to identify visual frames in a video clip that match text queries. Recent studies in VTG employ cross-attention to correlate visual frames and text queries as individual token sequences. However, these approaches overlook a crucial aspect of the problem: a holistic understanding of the query sentence. A model may capture correlations between individual word tokens and arbitrary visual frames while possibly missing out on the global meaning. To address this, we introduce two primary contributions: (1) a visual frame-level gate mechanism that incorporates holistic textual information, and (2) a cross-modal alignment loss to learn the fine-grained correlation between the query and relevant frames. As a result, we regularize the effect of individual word tokens and suppress irrelevant visual frames. We demonstrate that our method outperforms state-of-the-art approaches on VTG benchmarks, indicating that holistic text understanding guides the model to focus on the semantically important parts within the video.
Submitted 17 October, 2024; originally announced October 2024.
Comments: Accepted by ACMMM 24

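A toy version of a sentence-conditioned frame gate follows; the dimensions and the similarity-based gate form are illustrative assumptions rather than the paper's exact mechanism.

```python
# Toy frame-level gate driven by a holistic sentence embedding: frames
# weakly related to the whole query are suppressed before cross-attention.
import torch
import torch.nn as nn

frames = torch.randn(1, 20, 32)   # (batch, frames, dim)
sentence = torch.randn(1, 32)     # holistic query embedding
gate_proj = nn.Linear(32, 32)

# Scalar gate per frame from its similarity to the sentence embedding.
gate = torch.sigmoid((frames * gate_proj(sentence).unsqueeze(1)).sum(-1, keepdim=True))
gated_frames = gate * frames      # irrelevant frames shrink toward zero
print(gated_frames.shape, gate.squeeze(-1)[0, :5])
```
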
arXiv:2410.13564 [pdf, other] cs.CV
Generative Location Modeling for Spatially Aware Object Insertion
Authors: Jooyeol Yun, Davide Abati, Mohamed Omran, Jaegul Choo, Amirhossein Habibian, Auke Wiggers
Abstract: Generative models have become a powerful tool for image editing tasks, including object insertion. However, these methods often lack spatial awareness, generating objects with unrealistic locations and scales, or unintentionally altering the scene background. A key challenge lies in maintaining visual coherence, which requires both a geometrically suitable object location and a high-quality image edit. In this paper, we focus on the former, creating a location model dedicated to identifying realistic object locations. Specifically, we train an autoregressive model that generates bounding box coordinates, conditioned on the background image and the desired object class. This formulation allows us to effectively handle sparse placement annotations and to incorporate implausible locations into a preference dataset by performing direct preference optimization. Our extensive experiments demonstrate that our generative location model, when paired with an inpainting method, substantially outperforms state-of-the-art instruction-tuned models and location modeling baselines in object insertion tasks, delivering accurate and visually coherent results.
Submitted 17 October, 2024; originally announced October 2024.

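Generation then reduces to sampling one coordinate token at a time, conditioned on the background and class; the sketch below uses a stand-in next_token_distribution in place of the trained model.

```python
# Schematic autoregressive sampling of bounding-box tokens (x, y, w, h).
import random

def next_token_distribution(background, obj_class, prefix):
    # Stand-in: uniform over a coarse coordinate vocabulary [0..31].
    return [1 / 32] * 32

def sample_bbox(background, obj_class):
    prefix = []
    for _ in ("x", "y", "w", "h"):      # one token per box coordinate
        probs = next_token_distribution(background, obj_class, prefix)
        prefix.append(random.choices(range(32), weights=probs)[0])
    return prefix

print(sample_bbox(background="kitchen.png", obj_class="mug"))
```
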
arXiv:2410.13274 [pdf, other] cs.CL
Breaking Chains: Unraveling the Links in Multi-Hop Knowledge Unlearning
Authors: Minseok Choi, ChaeHun Park, Dohyun Lee, Jaegul Choo
Abstract: Large language models (LLMs) serve as giant information stores, often including personal or copyrighted data, and retraining them from scratch is not a viable option. This has led to the development of various fast, approximate unlearning techniques to selectively remove knowledge from LLMs. Prior research has largely focused on minimizing the probabilities of specific token sequences by reversing the language modeling objective. However, these methods still leave LLMs vulnerable to adversarial attacks that exploit indirect references. In this work, we examine the limitations of current unlearning techniques in effectively erasing a particular type of indirect prompt: multi-hop queries. Our findings reveal that existing methods fail to completely remove multi-hop knowledge when one of the intermediate hops is unlearned. To address this issue, we propose MUNCH, a simple uncertainty-based approach that breaks down multi-hop queries into subquestions and leverages the uncertainty of the unlearned model in final decision-making. Empirical results demonstrate the effectiveness of our framework, and MUNCH can be easily integrated with existing unlearning techniques, making it a flexible and useful solution for enhancing unlearning processes.
Submitted 17 October, 2024; originally announced October 2024.
Comments: 16 pages, 5 figures

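The decompose-then-check idea can be sketched as follows; decompose, answer_confidence, and the threshold tau are illustrative stand-ins, not MUNCH's actual scoring.

```python
# Sketch of uncertainty-gated multi-hop answering: if the unlearned model
# is uncertain on any hop, abstain instead of composing an answer.
def decompose(query):
    return ["Who is Alice's employer?", "Where is that employer based?"]

def answer_confidence(model, subq):
    return {"Who is Alice's employer?": 0.2}.get(subq, 0.9)  # toy values

def answer_multihop(model, query, tau=0.5):
    for subq in decompose(query):
        if answer_confidence(model, subq) < tau:  # unlearned hop detected
            return "[abstain: unlearned knowledge]"
    return "composed answer"

print(answer_multihop("unlearned-llm", "Where is Alice's employer based?"))
```
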
arXiv:2410.13274 [pdf, other] cs.CL
Breaking Chains: Unraveling the Links in Multi-Hop Knowledge Unlearning
Authors: Minseok Choi, ChaeHun Park, Dohyun Lee, Jaegul Choo
Abstract: Large language models (LLMs) serve as giant information stores, often including personal or copyrighted data, and retraining them from scratch is not a viable option. This has led to the development of various fast, approximate unlearning techniques to selectively remove knowledge from LLMs. Prior research has largely focused on minimizing the probabilities of specific token sequences by reversing the language modeling objective. However, these methods still leave LLMs vulnerable to adversarial attacks that exploit indirect references. In this work, we examine the limitations of current unlearning techniques in effectively erasing a particular type of indirect prompt: multi-hop queries. Our findings reveal that existing methods fail to completely remove multi-hop knowledge when one of the intermediate hops is unlearned. To address this issue, we propose MUNCH, a simple uncertainty-based approach that breaks down multi-hop queries into subquestions and leverages the uncertainty of the unlearned model in final decision-making. Empirical results demonstrate the effectiveness of our framework, and MUNCH can be easily integrated with existing unlearning techniques, making it a flexible and useful solution for enhancing unlearning processes.
Submitted 17 October, 2024; originally announced October 2024.
Comments: 16 pages, 5 figures
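A minimal sketch of the decomposition idea, assuming a generic answer_fn interface and mean negative log-probability as the uncertainty signal (MUNCH's actual scoring and decision rule may differ):

```python
def mean_nll(token_logprobs):
    """Uncertainty proxy: mean negative log-probability of the answer tokens."""
    return -sum(token_logprobs) / len(token_logprobs)

def answer_multihop(subquestions, answer_fn, threshold=1.5):
    """Answer hop by hop; treat a highly uncertain hop as unlearned and abstain.

    answer_fn(question, context) -> (answer_text, token_logprobs)
    """
    context, ans = "", None
    for q in subquestions:
        ans, logprobs = answer_fn(q, context)
        if mean_nll(logprobs) > threshold:
            return None                       # refuse: this hop looks unlearned
        context += f"Q: {q} A: {ans} "
    return ans

def stub_model(question, context):
    # pretend the model was unlearned on authorship facts
    if "wrote" in question:
        return "unknown-person", [-3.0, -2.5, -3.2]   # high NLL -> uncertain
    return "Paris", [-0.05, -0.1, -0.02]

print(answer_multihop(["Who wrote Novel X?", "Where was this author born?"], stub_model))
# -> None: the unlearned first hop is detected, so the whole chain is refused
```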
arXiv:2410.12692 [pdf, other] cs.CV cs.LG
Machine learning approach to brain tumor detection and classification
Authors: Alice Oh, Inyoung Noh, Jian Choo, Jihoo Lee, Justin Park, Kate Hwang, Sanghyeon Kim, Soo Min Oh
Abstract: Brain tumor detection and classification are critical tasks in medical image analysis, particularly in early-stage diagnosis, where accurate and timely detection can significantly improve treatment outcomes. In this study, we apply various statistical and machine learning models to detect and classify brain tumors using brain MRI images. We explore a variety of statistical models, including linear, logistic, and Bayesian regressions, and machine learning models, including decision tree, random forest, single-layer perceptron, multi-layer perceptron, convolutional neural network (CNN), recurrent neural network, and long short-term memory. Our findings show that CNN outperforms the other models, achieving the best performance. Additionally, we confirm that the CNN model can also work for multi-class classification, distinguishing between four categories of brain MRI images: normal, glioma, meningioma, and pituitary tumor. This study demonstrates that machine learning approaches are suitable for brain tumor detection and classification, facilitating real-world medical applications in assisting radiologists with early and accurate diagnosis.
Submitted 6 November, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
Comments: 7 pages, 2 figures, 2 tables
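For readers who want a concrete baseline, a small CNN classifier for the four-way MRI task might look like the following sketch; the architecture and sizes are illustrative, not the network evaluated in the paper.

```python
import torch
import torch.nn as nn

class TumorCNN(nn.Module):
    """Small CNN for 4-way MRI classification (normal/glioma/meningioma/pituitary)."""
    def __init__(self, n_classes=4):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
        )
        self.classifier = nn.Linear(64, n_classes)

    def forward(self, x):            # x: (B, 1, H, W) grayscale MRI slices
        return self.classifier(self.features(x).flatten(1))

logits = TumorCNN()(torch.randn(8, 1, 224, 224))
print(logits.shape)  # torch.Size([8, 4])
```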
arXiv:2410.12228 [pdf, other] cs.IR cs.AI cs.CL
Triple Modality Fusion: Aligning Visual, Textual, and Graph Data with Large Language Models for Multi-Behavior Recommendations
Authors: Luyi Ma, Xiaohan Li, Zezhong Fan, Jianpeng Xu, Jason Cho, Praveen Kanumala, Kaushiki Nag, Sushant Kumar, Kannan Achan
Abstract: Integrating diverse data modalities is crucial for enhancing the performance of personalized recommendation systems. Traditional models, which often rely on singular data sources, lack the depth needed to accurately capture the multifaceted nature of item features and user behaviors. This paper introduces a novel framework for multi-behavior recommendations that fuses three modalities (visual, textual, and graph data) through alignment with large language models (LLMs). By incorporating visual information, we capture contextual and aesthetic item characteristics; textual data provides detailed insights into user interests and item features; and graph data elucidates relationships within the item-behavior heterogeneous graphs. Our proposed model, Triple Modality Fusion (TMF), utilizes the power of LLMs to align and integrate these three modalities, achieving a comprehensive representation of user behaviors. The LLM models the user's interactions, including behaviors and item features, in natural language. Initially, the LLM is warmed up using only natural-language prompts. We then devise a modality fusion module, based on cross-attention and self-attention mechanisms, to integrate the different modalities from other models into the same embedding space and incorporate them into an LLM. Extensive experiments demonstrate the effectiveness of our approach in improving recommendation accuracy. Further ablation studies validate the effectiveness of our model design and the benefits of TMF.
Submitted 16 October, 2024; originally announced October 2024.
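The fusion module described above, cross-attention from text tokens over the other modalities followed by self-attention, can be sketched roughly as below. Dimensions, head counts, and the residual/norm arrangement are assumptions rather than the TMF specification.

```python
import torch
import torch.nn as nn

class ModalityFusion(nn.Module):
    """Fuse visual/graph token features into the text embedding space."""
    def __init__(self, dim=512, heads=8):
        super().__init__()
        self.cross = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, text_tokens, other_tokens):
        # text queries attend over concatenated visual + graph tokens
        x, _ = self.cross(text_tokens, other_tokens, other_tokens)
        x = self.norm1(text_tokens + x)
        y, _ = self.self_attn(x, x, x)       # mix the fused sequence
        return self.norm2(x + y)

fuse = ModalityFusion()
out = fuse(torch.randn(2, 16, 512), torch.randn(2, 32, 512))
print(out.shape)  # torch.Size([2, 16, 512])
```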
arXiv:2410.11682 [pdf, other] cs.GR cs.AI cs.CV
SurFhead: Affine Rig Blending for Geometrically Accurate 2D Gaussian Surfel Head Avatars
Authors: Jaeseong Lee, Taewoong Kang, Marcel C. Bühler, Min-Jung Kim, Sungwon Hwang, Junha Hyung, Hyojin Jang, Jaegul Choo
Abstract: Recent advancements in head avatar rendering using Gaussian primitives have achieved significantly high-fidelity results. Although precise head geometry is crucial for applications like mesh reconstruction and relighting, current methods struggle to capture intricate geometric details and render unseen poses due to their reliance on similarity transformations, which cannot handle stretch and shear transforms essential for detailed deformations of geometry. To address this, we propose SurFhead, a novel method that reconstructs riggable head geometry from RGB videos using 2D Gaussian surfels, which offer well-defined geometric properties, such as precise depth from fixed ray intersections and normals derived from their surface orientation, making them advantageous over 3D counterparts. SurFhead ensures high-fidelity rendering of both normals and images, even in extreme poses, by leveraging classical mesh-based deformation transfer and affine transformation interpolation. SurFhead introduces precise geometric deformation and blends surfels through polar decomposition of transformations, including those affecting normals. Our key contribution lies in bridging classical graphics techniques, such as mesh-based deformation, with modern Gaussian primitives, achieving state-of-the-art geometry reconstruction and rendering quality. Unlike previous avatar rendering approaches, SurFhead enables efficient reconstruction driven by Gaussian primitives while preserving high-fidelity geometry.
Submitted 15 October, 2024; originally announced October 2024.
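Polar decomposition, the tool SurFhead uses when blending transformations, is standardly computed from an SVD. The NumPy sketch below shows the textbook construction, not code from the paper:

```python
import numpy as np

def polar_decompose(A):
    """Factor a linear map as A = R @ S: R orthogonal, S a symmetric PSD stretch."""
    U, sigma, Vt = np.linalg.svd(A)
    R = U @ Vt                        # a proper rotation whenever det(A) > 0
    S = Vt.T @ np.diag(sigma) @ Vt    # symmetric stretch/shear part
    return R, S

A = np.eye(3) + 0.1 * np.random.randn(3, 3)   # near-identity deformation gradient
R, S = polar_decompose(A)
print(np.allclose(A, R @ S))                   # True
```

Blending the rotation part (e.g. via quaternions) separately from the stretch part is what keeps interpolated transforms from collapsing or flipping, which is the motivation for using the decomposition in rig blending.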
arXiv:2410.11536 [pdf, other] cs.CV
Overcoming Domain Limitations in Open-vocabulary Segmentation
Authors: Dongjun Hwang, Seong Joon Oh, Junsuk Choe
Abstract: Open-vocabulary segmentation (OVS) has gained attention for its ability to recognize a broader range of classes. However, OVS models show significant performance drops when applied to unseen domains beyond the previous training dataset. Fine-tuning these models on new datasets can improve performance, but often leads to the catastrophic forgetting of previously learned knowledge. To address this issue, we propose a method that allows OVS models to learn information from new domains while preserving prior knowledge. Our approach begins by evaluating the input sample's proximity to multiple domains, using precomputed multivariate normal distributions for each domain. Based on this prediction, we dynamically interpolate between the weights of the pre-trained decoder and the fine-tuned decoders. Extensive experiments demonstrate that this approach allows OVS models to adapt to new domains while maintaining performance on the previous training dataset. The source code is available at https://github.com/dongjunhwang/dwi.
Submitted 15 October, 2024; originally announced October 2024.
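A rough sketch of the two ingredients, Gaussian domain proximity and per-parameter weight interpolation, under assumed shapes and with a squared Mahalanobis distance as the proximity score (the paper's exact scoring and interpolation rule may differ):

```python
import torch

def domain_weights(feat, means, inv_covs):
    """Softmax over negative squared Mahalanobis distances to each domain Gaussian."""
    d = torch.stack([(feat - m) @ ic @ (feat - m) for m, ic in zip(means, inv_covs)])
    return torch.softmax(-d, dim=0)

def interpolate_state_dicts(state_dicts, weights):
    """Per-parameter convex combination of pre-trained and fine-tuned decoders."""
    return {k: sum(w * sd[k] for w, sd in zip(weights, state_dicts))
            for k in state_dicts[0]}

# two toy 'domains' with identity covariance, and two toy decoders
means = [torch.zeros(4), torch.ones(4) * 5]
inv_covs = [torch.eye(4), torch.eye(4)]
w = domain_weights(torch.zeros(4) + 0.1, means, inv_covs)   # input near domain 0
decoders = [{"fc.weight": torch.zeros(2, 2)}, {"fc.weight": torch.ones(2, 2)}]
print(w, interpolate_state_dicts(decoders, w)["fc.weight"])
```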
arXiv:2410.10269 [pdf, other] eess.IV cs.CV
Two-Stage Approach for Brain MR Image Synthesis: 2D Image Synthesis and 3D Refinement
Authors: Jihoon Cho, Seunghyuck Park, Jinah Park
Abstract: Despite significant advancements in automatic brain tumor segmentation methods, their performance is not guaranteed when certain MR sequences are missing. To address this issue, it is crucial to synthesize missing MR images that reflect the unique characteristics of the absent modality with precise tumor representation. Typically, MRI synthesis methods generate partial images rather than full-sized volumes due to computational constraints. This limitation can lead to a lack of comprehensive 3D volumetric information and result in image artifacts during the merging process. In this paper, we propose a two-stage approach that first synthesizes MR images from 2D slices using a novel intensity encoding method and then refines the synthesized MRI. The proposed intensity encoding reduces artifacts when synthesizing MRI on a 2D slice basis. Then, the Refiner, which leverages complete 3D volume information, further improves the quality of the synthesized images and enhances their applicability to segmentation methods. Experimental results demonstrate that the intensity encoding effectively minimizes artifacts in the synthesized MRI and improves perceptual quality. Furthermore, using the Refiner on synthesized MRI significantly improves brain tumor segmentation results, highlighting the potential of our approach in practical applications.
Submitted 1 December, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
Comments: MICCAI 2024 BraSyn Challenge 1st place
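The two-stage structure can be summarized in a short sketch: a 2D model synthesizes each slice, and a small residual 3D network (standing in for the Refiner) smooths the stacked volume with full volumetric context. All modules and shapes here are illustrative assumptions, and the intensity encoding itself is omitted.

```python
import torch
import torch.nn as nn

class Refiner3D(nn.Module):
    """Small residual 3D conv net that smooths a stack of per-slice syntheses."""
    def __init__(self, ch=8):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv3d(1, ch, 3, padding=1), nn.ReLU(),
            nn.Conv3d(ch, 1, 3, padding=1),
        )

    def forward(self, vol):              # vol: (B, 1, D, H, W)
        return vol + self.net(vol)       # residual refinement with 3D context

def synthesize_volume(slice_model, source_slices, refiner):
    # stage 1: synthesize each 2D slice; stage 2: refine the stacked 3D volume
    vol = torch.stack([slice_model(s) for s in source_slices], dim=2)
    return refiner(vol)

identity = lambda s: s                   # stand-in for a trained 2D synthesis model
out = synthesize_volume(identity,
                        [torch.randn(1, 1, 64, 64) for _ in range(16)],
                        Refiner3D())
print(out.shape)                         # torch.Size([1, 1, 16, 64, 64])
```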
arXiv:2410.10149 [pdf, other] cs.CV cs.GR cs.LG
Fast and Accurate Neural Rendering Using Semi-Gradients
Authors: In-Young Cho, Jaewoong Cho
Abstract: We propose a simple yet effective neural network-based framework for global illumination rendering. Recently, rendering techniques that learn neural radiance caches by minimizing the difference (i.e., residual) between the left and right sides of the rendering equation have been suggested. Due to their ease of implementation and the advantage of excluding path integral calculations, these techniques have been applied to various fields, such as free-viewpoint rendering, differentiable rendering, and real-time rendering. However, issues of slow training and occasionally darkened renders have been noted. We identify the cause of these issues as the bias and high variance present in the gradient estimates of the existing residual-based objective function. To address this, we introduce a new objective function that maintains the same global optimum as before but allows for unbiased and low-variance gradient estimates, enabling faster and more accurate training of neural networks. This method is implemented simply by ignoring the partial derivatives of the right-hand side, and theoretical and experimental analyses demonstrate the effectiveness of the proposed loss.
Submitted 14 October, 2024; originally announced October 2024.
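In autograd terms, "ignoring the partial derivatives of the right-hand side" amounts to a stop-gradient on the target side of the residual. A minimal sketch of the contrast (not the paper's rendering code):

```python
import torch

def residual_loss(lhs, rhs):
    """Naive residual objective: gradients flow through both sides of the equation."""
    return ((lhs - rhs) ** 2).mean()

def semi_gradient_loss(lhs, rhs):
    """Same global optimum, but the right-hand side is treated as a fixed target."""
    return ((lhs - rhs.detach()) ** 2).mean()

x = torch.randn(8, requires_grad=True)
semi_gradient_loss(x, 2 * x).backward()   # gradient flows only via the left-hand side
print(x.grad)
```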
arXiv:2410.09807 [pdf, other] cs.CL cs.AI
Single Ground Truth Is Not Enough: Adding Flexibility to Aspect-Based Sentiment Analysis Evaluation
Authors: Soyoung Yang, Hojun Cho, Jiyoung Lee, Sohee Yoon, Edward Choi, Jaegul Choo, Won Ik Cho
Abstract: Aspect-based sentiment analysis (ABSA) is the challenging task of extracting sentiments along with their corresponding aspects and opinion terms from text. The inherent subjectivity of span annotation introduces variability in the surface forms of extracted terms, complicating the evaluation process. Traditional evaluation methods often constrain ground truths (GT) to a single term, potentially misrepresenting the accuracy of semantically valid predictions that differ in surface form. To address this limitation, we propose a novel, fully automated pipeline that expands existing evaluation sets by adding alternative valid terms for aspects and opinions. Our approach facilitates an equitable assessment of language models by accommodating multiple answer candidates, resulting in enhanced human agreement compared to single-answer test sets (achieving up to a 10%p improvement in Kendall's Tau score). Experimental results demonstrate that our expanded evaluation set helps uncover capabilities of large language models (LLMs) in ABSA tasks that are concealed by single-answer GT sets. Consequently, our work contributes to the development of a flexible evaluation framework for ABSA by embracing diverse surface forms in span extraction tasks in a cost-effective and reproducible manner. Our code and dataset are available at https://github.com/dudrrm/zoom-in-n-out-absa.
Submitted 11 February, 2025; v1 submitted 13 October, 2024; originally announced October 2024.
Comments: NAACL 2025 camera-ready
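The core evaluation change, accepting any of several valid surface forms instead of one canonical GT span, reduces to a one-line matching rule. The normalization below is an assumption for illustration; the paper's pipeline for generating the alternative terms is the substantive contribution and is not reproduced here.

```python
def span_match(pred, gold_alternatives):
    """Count a prediction as correct if it matches any accepted surface form."""
    norm = lambda s: " ".join(s.lower().split())
    return any(norm(pred) == norm(g) for g in gold_alternatives)

# both surface forms of the same aspect term are accepted
print(span_match("The battery life", ["battery life", "the battery life"]))  # True
print(span_match("battery", ["battery life", "the battery life"]))           # False
```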
arXiv:2410.09754 [pdf, other] cs.LG cs.AI
SimBa: Simplicity Bias for Scaling Up Parameters in Deep Reinforcement Learning
Authors: Hojoon Lee, Dongyoon Hwang, Donghu Kim, Hyunseung Kim, Jun Jet Tai, Kaushik Subramanian, Peter R. Wurman, Jaegul Choo, Peter Stone, Takuma Seno
Abstract: Recent advances in CV and NLP have been largely driven by scaling up the number of network parameters, despite traditional theories suggesting that larger networks are prone to overfitting. These large networks avoid overfitting by integrating components that induce a simplicity bias, guiding models toward simple and generalizable solutions. However, in deep RL, designing and scaling up networks have been less explored. Motivated by this opportunity, we present SimBa, an architecture designed to scale up parameters in deep RL by injecting a simplicity bias. SimBa consists of three components: (i) an observation normalization layer that standardizes inputs with running statistics, (ii) a residual feedforward block that provides a linear pathway from the input to the output, and (iii) a layer normalization that controls feature magnitudes. Scaling up parameters with SimBa consistently improves the sample efficiency of various deep RL algorithms, including off-policy, on-policy, and unsupervised methods. Moreover, solely by integrating the SimBa architecture into SAC, it matches or surpasses state-of-the-art deep RL methods with high computational efficiency across DMC, MyoSuite, and HumanoidBench. These results demonstrate SimBa's broad applicability and effectiveness across diverse RL algorithms and environments.
Submitted 13 October, 2024; originally announced October 2024.
Comments: preprint
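The three components can be sketched directly. The EMA form of the running statistics and all sizes below are simplifying assumptions for illustration, not the released SimBa code:

```python
import torch
import torch.nn as nn

class RunningNorm(nn.Module):
    """Observation normalization (EMA running statistics, a simplification)."""
    def __init__(self, dim, momentum=0.01, eps=1e-5):
        super().__init__()
        self.register_buffer("mean", torch.zeros(dim))
        self.register_buffer("var", torch.ones(dim))
        self.momentum, self.eps = momentum, eps

    def forward(self, x):
        if self.training:  # update statistics from the incoming batch
            self.mean.lerp_(x.mean(0), self.momentum)
            self.var.lerp_(x.var(0, unbiased=False), self.momentum)
        return (x - self.mean) / torch.sqrt(self.var + self.eps)

class SimBaBlock(nn.Module):
    """Pre-norm residual feedforward block: a linear path from input to output."""
    def __init__(self, dim, hidden):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.ff = nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(),
                                nn.Linear(hidden, dim))

    def forward(self, x):
        return x + self.ff(self.norm(x))

class SimBaEncoder(nn.Module):
    def __init__(self, obs_dim, dim=256, depth=2):
        super().__init__()
        self.obs_norm = RunningNorm(obs_dim)            # (i) observation normalization
        self.embed = nn.Linear(obs_dim, dim)
        self.blocks = nn.Sequential(*[SimBaBlock(dim, 4 * dim)   # (ii) residual blocks
                                      for _ in range(depth)])
        self.out_norm = nn.LayerNorm(dim)               # (iii) final layer normalization

    def forward(self, obs):
        return self.out_norm(self.blocks(self.embed(self.obs_norm(obs))))

print(SimBaEncoder(obs_dim=17)(torch.randn(32, 17)).shape)  # torch.Size([32, 256])
```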
arXiv:2410.07952 [pdf, ps, other] cs.GT eess.SY math.OC
Eco-driving Incentive Mechanisms for Mitigating Emissions in Urban Transportation
Authors: M. Umar B. Niazi, Jung-Hoon Cho, Munther A. Dahleh, Roy Dong, Cathy Wu
Abstract: This paper proposes incentive mechanisms that promote eco-driving in transportation networks, with the overarching objective of minimizing emissions. The transportation system operator provides drivers with energy-efficient driving guidance throughout their trips, and their eco-driving levels are measured by how closely they follow this guidance via vehicle telematics. Drivers choose their eco-driving levels to optimize a combination of their travel times and their emissions. To obtain optimal budget allocation and recommendations for the incentive mechanism, the system operator gathers drivers' preferences, or types, to assess each driver's trip urgency and natural willingness to eco-drive. In a setting where drivers truthfully report their types, we introduce the first-best incentive mechanism and show that the obedience condition holds (i.e., drivers find it optimal to comply with the system operator's recommendations) when the recommended eco-driving profile constitutes a Nash equilibrium. Moreover, in a setting where drivers can strategically report their types, we introduce the second-best incentive mechanism and show that the proposed mechanism is incentive-compatible (i.e., drivers find it optimal to be truthful). Under this mechanism, we also show that all equilibrium outcomes are at least as good as the recommended eco-driving profile in terms of the system operator's objective. Overall, this work offers a framework for designing eco-driving incentive mechanisms that considers both the strategic behavior of individual drivers and the network effects of collective decision-making.
Submitted 10 October, 2024; originally announced October 2024.
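As a toy illustration of the driver's trade-off (not the paper's model), consider a driver choosing an eco-driving level on a grid to minimize a stylized private cost; raising the incentive shifts the best response upward.

```python
import numpy as np

def driver_best_response(urgency, reluctance, incentive, grid=np.linspace(0, 1, 101)):
    """Driver picks an eco-driving level e in [0, 1] minimizing a toy private cost.

    The cost trades extra travel time (worse for urgent drivers) against an
    emission-related cost (worse for reluctant drivers), minus the incentive payment.
    All functional forms here are invented for illustration.
    """
    cost = urgency * grid**2 + reluctance * (1 - grid) - incentive * grid
    return grid[np.argmin(cost)]

for b in (0.0, 0.3, 0.8):   # a larger incentive pushes the chosen eco-driving level up
    print(b, driver_best_response(urgency=1.0, reluctance=0.4, incentive=b))
```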