Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 229 results for author: <span class="mathjax">Cho, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Cho%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Cho, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Cho%2C+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Cho, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Cho%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Cho%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Cho%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Cho%2C+M&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Cho%2C+M&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Cho%2C+M&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12325">arXiv:2502.12325</a> <span> [<a href="https://arxiv.org/pdf/2502.12325">pdf</a>, <a href="https://arxiv.org/format/2502.12325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Dense to Dynamic: Token-Difficulty Driven MoEfication of Pre-Trained LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nishu%2C+K">Kumari Nishu</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+S">Sachin Mehta</a>, <a href="/search/cs?searchtype=author&query=Abnar%2C+S">Samira Abnar</a>, <a href="/search/cs?searchtype=author&query=Farajtabar%2C+M">Mehrdad Farajtabar</a>, <a href="/search/cs?searchtype=author&query=Horton%2C+M">Maxwell Horton</a>, <a href="/search/cs?searchtype=author&query=Najibi%2C+M">Mahyar Najibi</a>, <a href="/search/cs?searchtype=author&query=Nabi%2C+M">Moin Nabi</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsik Cho</a>, <a href="/search/cs?searchtype=author&query=Naik%2C+D">Devang Naik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12325v1-abstract-short" style="display: inline;"> Training large language models (LLMs) for different inference constraints is computationally expensive, limiting control over efficiency-accuracy trade-offs. Moreover, once trained, these models typically process tokens uniformly, regardless of their complexity, leading to static and inflexible behavior. 
In this paper, we introduce a post-training optimization framework, DynaMoE, that adapts a pre-trained dense LLM to a token-difficulty-driven Mixture-of-Experts model with minimal fine-tuning cost. This adaptation makes the model dynamic, with sensitivity control to customize the balance between efficiency and accuracy. DynaMoE features a token-difficulty-aware router that predicts the difficulty of tokens and directs them to the appropriate sub-networks or experts, enabling larger experts to handle more complex tokens and smaller experts to process simpler ones. Our experiments demonstrate that DynaMoE can generate a range of adaptive model variants of the existing trained LLM with a single fine-tuning step, utilizing only $10B$ tokens, a minimal cost compared to the base model's training. Each variant offers distinct trade-offs between accuracy and performance. Compared to the baseline post-training optimization framework, Flextron, our method achieves similar aggregated accuracy across downstream tasks, despite using only $\frac{1}{9}\text{th}$ of their fine-tuning cost.
Submitted 17 February, 2025; originally announced February 2025.
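The abstract above describes a router that scores each token's difficulty and dispatches it to a differently sized expert. As a rough illustration only (the paper's code is not shown here; module names, sizes, and the threshold below are assumptions), such a router might be sketched in PyTorch as:

```python
# Minimal sketch of token-difficulty-driven routing, loosely following the
# DynaMoE description above. All module names, sizes, and the threshold
# are illustrative assumptions, not the paper's implementation.
import torch
import torch.nn as nn

class DifficultyRouter(nn.Module):
    def __init__(self, d_model: int, d_small: int, d_large: int, threshold: float = 0.5):
        super().__init__()
        self.difficulty = nn.Linear(d_model, 1)           # predicts per-token difficulty
        self.small = nn.Sequential(nn.Linear(d_model, d_small), nn.GELU(),
                                   nn.Linear(d_small, d_model))
        self.large = nn.Sequential(nn.Linear(d_model, d_large), nn.GELU(),
                                   nn.Linear(d_large, d_model))
        self.threshold = threshold                        # "sensitivity control" knob

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (tokens, d_model)
        score = torch.sigmoid(self.difficulty(x)).squeeze(-1)
        hard = score > self.threshold                     # hard tokens -> large expert
        out = torch.empty_like(x)
        out[hard] = self.large(x[hard])
        out[~hard] = self.small(x[~hard])
        return out
```

Raising or lowering the (hypothetical) threshold shifts tokens between the small and large experts, which is one plausible reading of the efficiency-accuracy "sensitivity control" the abstract mentions.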
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02548">arXiv:2502.02548</a> <span> [<a href="https://arxiv.org/pdf/2502.02548">pdf</a>, <a href="https://arxiv.org/format/2502.02548">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mosaic3D: Foundation Dataset and Model for Open-Vocabulary 3D Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Junha Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+C">Chunghyun Park</a>, <a href="/search/cs?searchtype=author&query=Choe%2C+J">Jaesung Choe</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a>, <a href="/search/cs?searchtype=author&query=Kautz%2C+J">Jan Kautz</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&query=Choy%2C+C">Chris Choy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02548v1-abstract-short" style="display: inline;"> We tackle open-vocabulary 3D scene understanding by introducing a novel data generation pipeline and training framework. Our method addresses three critical requirements for effective training: precise 3D region segmentation, comprehensive textual descriptions, and sufficient dataset scale. By leveraging state-of-the-art open-vocabulary image segmentation models and region-aware Vision-Language Mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02548v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02548v1-abstract-full" style="display: none;"> We tackle open-vocabulary 3D scene understanding by introducing a novel data generation pipeline and training framework. Our method addresses three critical requirements for effective training: precise 3D region segmentation, comprehensive textual descriptions, and sufficient dataset scale. By leveraging state-of-the-art open-vocabulary image segmentation models and region-aware Vision-Language Models, we develop an automatic pipeline that generates high-quality 3D mask-text pairs. Applying this pipeline to multiple 3D scene datasets, we create Mosaic3D-5.6M, a dataset of over 30K annotated scenes with 5.6M mask-text pairs, significantly larger than existing datasets. Building upon this data, we propose Mosaic3D, a foundation model combining a 3D encoder trained with contrastive learning and a lightweight mask decoder for open-vocabulary 3D semantic and instance segmentation. Our approach achieves state-of-the-art results on open-vocabulary 3D semantic and instance segmentation tasks including ScanNet200, Matterport3D, and ScanNet++, with ablation studies validating the effectiveness of our large-scale training data. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02548v1-abstract-full').style.display = 'none'; document.getElementById('2502.02548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://nvlabs.github.io/Mosaic3D/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11930">arXiv:2412.11930</a> <span> [<a href="https://arxiv.org/pdf/2412.11930">pdf</a>, <a href="https://arxiv.org/format/2412.11930">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Meta-Reinforcement Learning via Automated Macro-Action Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minjae Cho</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chuangchuang Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11930v1-abstract-short" style="display: inline;"> Meta-Reinforcement Learning (Meta-RL) enables fast adaptation to new testing tasks. Despite recent advancements, it is still challenging to learn performant policies across multiple complex and high-dimensional tasks. To address this, we propose a novel architecture with three hierarchical levels for 1) learning task representations, 2) discovering task-agnostic macro-actions in an automated manne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11930v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11930v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11930v1-abstract-full" style="display: none;"> Meta-Reinforcement Learning (Meta-RL) enables fast adaptation to new testing tasks. Despite recent advancements, it is still challenging to learn performant policies across multiple complex and high-dimensional tasks. To address this, we propose a novel architecture with three hierarchical levels for 1) learning task representations, 2) discovering task-agnostic macro-actions in an automated manner, and 3) learning primitive actions. The macro-action can guide the low-level primitive policy learning to more efficiently transition to goal states. This can address the issue that the policy may forget previously learned behavior while learning new, conflicting tasks. Moreover, the task-agnostic nature of the macro-actions is enabled by removing task-specific components from the state space. Hence, this makes them amenable to re-composition across different tasks and leads to promising fast adaptation to new tasks. 
Also, the potential instability from the tri-level hierarchy is effectively mitigated by our independently tailored training schemes. Experiments in the MetaWorld framework demonstrate the improved sample efficiency and success rate of our approach compared to previous state-of-the-art methods.
Submitted 16 December, 2024; originally announced December 2024.

4. arXiv:2412.04353 [pdf, other] cs.CV cs.LG
ActFusion: a Unified Diffusion Model for Action Segmentation and Anticipation
Authors: Dayoung Gong, Suha Kwak, Minsu Cho
Abstract: Temporal action segmentation and long-term action anticipation are two popular vision tasks for the temporal analysis of actions in videos. Despite apparent relevance and potential complementarity, these two problems have been investigated as separate and distinct tasks. In this work, we tackle these two problems, action segmentation and action anticipation, jointly using a unified diffusion model dubbed ActFusion. The key idea to unification is to train the model to effectively handle both visible and invisible parts of the sequence in an integrated manner; the visible part is for temporal segmentation, and the invisible part is for future anticipation. To this end, we introduce a new anticipative masking strategy during training in which a late part of the video frames is masked as invisible, and learnable tokens replace these frames to learn to predict the invisible future. Experimental results demonstrate the bi-directional benefits between action segmentation and anticipation. ActFusion achieves state-of-the-art performance across the standard benchmarks of 50 Salads, Breakfast, and GTEA, outperforming task-specific models in both tasks with a single unified model through joint learning.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Accepted to NeurIPS 2024
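The anticipative masking strategy described above lends itself to a short sketch: the trailing portion of a frame-feature sequence is replaced by a shared learnable token. Names and the mask ratio below are illustrative assumptions, not ActFusion's code:

```python
# Illustrative sketch of anticipative masking: the trailing part of a
# frame sequence is replaced by a learnable token before being passed to
# the model. Names and mask ratio are assumptions for illustration only.
import torch
import torch.nn as nn

class AnticipativeMasking(nn.Module):
    def __init__(self, d_model: int, mask_ratio: float = 0.3):
        super().__init__()
        self.mask_token = nn.Parameter(torch.zeros(d_model))  # learnable placeholder
        self.mask_ratio = mask_ratio

    def forward(self, frames: torch.Tensor) -> torch.Tensor:  # (batch, time, d_model)
        t = frames.size(1)
        visible = int(t * (1 - self.mask_ratio))              # frames kept for segmentation
        masked = frames.clone()
        masked[:, visible:] = self.mask_token                 # invisible future to predict
        return masked
```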
5. arXiv:2412.03223 [pdf, other] cs.CL
Linq-Embed-Mistral Technical Report
Authors: Chanyeol Choi, Junseong Kim, Seolhwa Lee, Jihoon Kwon, Sangmo Gu, Yejin Kim, Minkyung Cho, Jy-yong Sohn
Abstract: This report explores the enhancement of text retrieval performance using advanced data refinement techniques.
We develop Linq-Embed-Mistral (https://huggingface.co/Linq-AI-Research/Linq-Embed-Mistral) by building on the E5-mistral and Mistral-7B-v0.1 models, focusing on sophisticated data crafting, data filtering, and negative mining methods, which are highly tailored to each task and applied to both existing benchmark datasets and highly tailored synthetic datasets generated via large language models (LLMs). Linq-Embed-Mistral excels in the MTEB benchmarks (as of May 29, 2024), achieving an average score of 68.2 across 56 datasets, and ranks 1st among all models for retrieval tasks on the MTEB leaderboard with a performance score of 60.2. This performance underscores its superior capability in enhancing search precision and reliability. Our contributions include advanced data refinement methods that significantly improve model performance on benchmark and synthetic datasets, techniques for homogeneous task ordering and mixed task fine-tuning to enhance model generalization and stability, and a streamlined evaluation process using 4-bit precision and a light retrieval evaluation set, which accelerates validation without sacrificing accuracy.
Submitted 4 December, 2024; originally announced December 2024.
Comments: 15 pages

6. arXiv:2412.03077 [pdf, other] cs.CV
RoDyGS: Robust Dynamic Gaussian Splatting for Casual Videos
Authors: Yoonwoo Jeong, Junmyeong Lee, Hoseung Choi, Minsu Cho
Abstract: Dynamic view synthesis (DVS) has advanced remarkably in recent years, achieving high-fidelity rendering while reducing computational costs. Despite the progress, optimizing dynamic neural fields from casual videos remains challenging, as these videos do not provide direct 3D information, such as camera trajectories or the underlying scene geometry.
In this work, we present RoDyGS, an optimization pipeline for dynamic Gaussian Splatting from casual videos. It effectively learns motion and underlying geometry of scenes by separating dynamic and static primitives, and ensures that the learned motion and geometry are physically plausible by incorporating motion and geometric regularization terms. We also introduce a comprehensive benchmark, Kubric-MRig, that provides extensive camera and object motion along with simultaneous multi-view captures, features that are absent in previous benchmarks. Experimental results demonstrate that the proposed method significantly outperforms previous pose-free dynamic neural fields and achieves competitive rendering quality compared to existing pose-free static neural fields. The code and data are publicly available at https://rodygs.github.io/.
Submitted 4 December, 2024; originally announced December 2024.
Comments: Project Page: https://rodygs.github.io/

7. arXiv:2411.18995 [pdf, other] cs.CV
MVFormer: Diversifying Feature Normalization and Token Mixing for Efficient Vision Transformers
Authors: Jongseong Bae, Susang Kim, Minsu Cho, Ha Young Kim
Abstract: Active research is currently underway to enhance the efficiency of vision transformers (ViTs).
Most studies have focused solely on effective token mixers, overlooking the potential relationship with normalization. To boost diverse feature learning, we propose two components: a normalization module called multi-view normalization (MVN) and a token mixer called multi-view token mixer (MVTM). The MVN integrates three differently normalized features via batch, layer, and instance normalization using a learnable weighted sum. Each normalization method outputs a different distribution, generating distinct features. Thus, the MVN is expected to offer diverse pattern information to the token mixer, resulting in beneficial synergy. The MVTM is a convolution-based multiscale token mixer with local, intermediate, and global filters, and it incorporates stage specificity by configuring various receptive fields for the token mixer at each stage, efficiently capturing ranges of visual patterns. We propose a novel ViT model, the multi-vision transformer (MVFormer), adopting the MVN and MVTM in the MetaFormer block, the generalized ViT scheme. Our MVFormer outperforms state-of-the-art convolution-based ViTs on image classification, object detection, and instance and semantic segmentation with the same or fewer parameters and MACs. In particular, the MVFormer variants MVFormer-T, S, and B achieve 83.4%, 84.3%, and 84.6% top-1 accuracy, respectively, on the ImageNet-1K benchmark.
Submitted 28 November, 2024; originally announced November 2024.
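The MVN described above is concrete enough to sketch: three normalized views of the same feature map combined by a learnable weighted sum. The following is a minimal PyTorch approximation, not the MVFormer reference implementation; the softmax over the mixing weights and the GroupNorm stand-in for layer normalization are assumptions:

```python
# Rough sketch of a multi-view normalization (MVN) layer: batch-, layer-,
# and instance-normalized views of the input combined with a learnable
# weighted sum. Details are assumptions, not the paper's implementation.
import torch
import torch.nn as nn

class MultiViewNorm(nn.Module):
    def __init__(self, channels: int):
        super().__init__()
        self.bn = nn.BatchNorm2d(channels)
        self.ln = nn.GroupNorm(1, channels)         # layer-norm-like over (C, H, W)
        self.inorm = nn.InstanceNorm2d(channels, affine=True)
        self.weights = nn.Parameter(torch.ones(3))  # learnable mixing weights

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (N, C, H, W)
        w = torch.softmax(self.weights, dim=0)      # keeps the weighted sum well-scaled
        return w[0] * self.bn(x) + w[1] * self.ln(x) + w[2] * self.inorm(x)
```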
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07942">arXiv:2411.07942</a> <span> [<a href="https://arxiv.org/pdf/2411.07942">pdf</a>, <a href="https://arxiv.org/format/2411.07942">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Low-bit Communication for Tensor Parallel LLM Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+H">Harry Dong</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+T">Tyler Johnson</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsik Cho</a>, <a href="/search/cs?searchtype=author&query=Soroush%2C+E">Emad Soroush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07942v1-abstract-short" style="display: inline;"> Tensor parallelism provides an effective way to increase server large language model (LLM) inference efficiency despite adding an additional communication cost. However, as server LLMs continue to scale in size, they will need to be distributed across more devices, magnifying the communication cost. One way to approach this problem is with quantization, but current methods for LLMs tend to avoid q… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07942v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07942v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07942v1-abstract-full" style="display: none;"> Tensor parallelism provides an effective way to increase server large language model (LLM) inference efficiency despite adding an additional communication cost. However, as server LLMs continue to scale in size, they will need to be distributed across more devices, magnifying the communication cost. One way to approach this problem is with quantization, but current methods for LLMs tend to avoid quantizing the features that tensor parallelism needs to communicate. Taking advantage of consistent outliers in communicated features, we introduce a quantization method that reduces communicated values on average from 16 bits to 4.2 bits while preserving nearly all of the original performance. For instance, our method maintains around 98.0% and 99.5% of Gemma 2 27B's and Llama 2 13B's original performance, respectively, averaged across all tasks we evaluated on. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07942v1-abstract-full').style.display = 'none'; document.getElementById('2411.07942v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00543">arXiv:2411.00543</a> <span> [<a href="https://arxiv.org/pdf/2411.00543">pdf</a>, <a href="https://arxiv.org/format/2411.00543">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> 3D Equivariant Pose Regression via Direct Wigner-D Harmonics Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jongmin Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsu Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00543v2-abstract-short" style="display: inline;"> Determining the 3D orientations of an object in an image, known as single-image pose estimation, is a crucial task in 3D vision applications. Existing methods typically learn 3D rotations parametrized in the spatial domain using Euler angles or quaternions, but these representations often introduce discontinuities and singularities. SO(3)-equivariant networks enable the structured capture of pose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00543v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00543v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00543v2-abstract-full" style="display: none;"> Determining the 3D orientations of an object in an image, known as single-image pose estimation, is a crucial task in 3D vision applications. Existing methods typically learn 3D rotations parametrized in the spatial domain using Euler angles or quaternions, but these representations often introduce discontinuities and singularities. SO(3)-equivariant networks enable the structured capture of pose patterns with data-efficient learning, but the parametrizations in spatial domain are incompatible with their architecture, particularly spherical CNNs, which operate in the frequency domain to enhance computational efficiency. To overcome these issues, we propose a frequency-domain approach that directly predicts Wigner-D coefficients for 3D rotation regression, aligning with the operations of spherical CNNs. Our SO(3)-equivariant pose harmonics predictor overcomes the limitations of spatial parameterizations, ensuring consistent pose estimation under arbitrary rotations. Trained with a frequency-domain regression loss, our method achieves state-of-the-art results on benchmarks such as ModelNet10-SO(3) and PASCAL3D+, with significant improvements in accuracy, robustness, and data efficiency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00543v2-abstract-full').style.display = 'none'; document.getElementById('2411.00543v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024, Project webpage at http://cvlab.postech.ac.kr/research/3D_EquiPose</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12592">arXiv:2410.12592</a> <span> [<a href="https://arxiv.org/pdf/2410.12592">pdf</a>, <a href="https://arxiv.org/format/2410.12592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Cocoon: Robust Multi-Modal Perception with Uncertainty-Aware Sensor Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minkyoung Cho</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yulong Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiachen Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingzhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Pavone%2C+M">Marco Pavone</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J+J">Jeong Joon Park</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Heng Yang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Z+M">Z. Morley Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12592v1-abstract-short" style="display: inline;"> An important paradigm in 3D object detection is the use of multiple modalities to enhance accuracy in both normal and challenging conditions, particularly for long-tail scenarios. To address this, recent studies have explored two directions of adaptive approaches: MoE-based adaptive fusion, which struggles with uncertainties arising from distinct object configurations, and late fusion for output-l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12592v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12592v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12592v1-abstract-full" style="display: none;"> An important paradigm in 3D object detection is the use of multiple modalities to enhance accuracy in both normal and challenging conditions, particularly for long-tail scenarios. 
To address this, recent studies have explored two directions of adaptive approaches: MoE-based adaptive fusion, which struggles with uncertainties arising from distinct object configurations, and late fusion for output-level adaptive fusion, which relies on separate detection pipelines and limits comprehensive understanding. In this work, we introduce Cocoon, an object- and feature-level uncertainty-aware fusion framework. The key innovation lies in uncertainty quantification for heterogeneous representations, enabling fair comparison across modalities through the introduction of a feature aligner and a learnable surrogate ground truth, termed feature impression. We also define a training objective to ensure that their relationship provides a valid metric for uncertainty quantification. Cocoon consistently outperforms existing static and adaptive methods in both normal and challenging conditions, including those with natural and artificial corruptions. Furthermore, we show the validity and efficacy of our uncertainty metric across diverse datasets.
Submitted 16 October, 2024; originally announced October 2024.
Comments: 23 pages

11. arXiv:2410.10846 [pdf, other] cs.LG cs.CL
Duo-LLM: A Framework for Studying Adaptive Computation in Large Language Models
Authors: Keivan Alizadeh, Iman Mirzadeh, Hooman Shahrokhi, Dmitry Belenko, Frank Sun, Minsik Cho, Mohammad Hossein Sekhavat, Moin Nabi, Mehrdad Farajtabar
Abstract: Large Language Models (LLMs) typically generate outputs token by token using a fixed compute budget, leading to inefficient resource utilization.
To address this shortcoming, recent advancements in mixture-of-experts (MoE) models, speculative decoding, and early-exit strategies leverage the insight that computational demands can vary significantly based on the complexity and nature of the input. However, identifying optimal routing patterns for dynamic execution remains an open challenge, limiting the full potential of these adaptive methods. To address this need, we study adaptive computation in LLMs more systematically. We propose a novel framework that integrates smaller auxiliary modules within each Feed-Forward Network layer of the LLM. This design enables dynamic routing of tokens based on task complexity: tokens can be processed by either the small or big modules at each layer, or even bypass certain layers entirely. This allows us to introduce a novel notion of a token's difficulty, defined by its potential to benefit from additional computational resources. Importantly, by employing oracles to identify optimal patterns of adaptive computations, we gain valuable insights into the internal workings of LLMs and the routing processes in a simplified heterogeneous MoE setup. We show that trained routers operate differently from oracles and often yield suboptimal solutions. Notably, activating a large module in just one layer outperforms models that use large modules across all layers, underscoring the gap between practical implementations of routing in MoE models and theoretical optima for adaptive computation.
Submitted 1 October, 2024; originally announced October 2024.
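As a hedged sketch of the layer design described above (a small auxiliary module, the big FFN, or a full bypass per token), the following is one way such a layer could be structured; the hard argmax routing and module sizes are assumptions, not the framework's actual code:

```python
# Minimal sketch of an adaptive FFN layer: each token is routed to a small
# auxiliary module, the big FFN, or an identity bypass. Illustrative only.
import torch
import torch.nn as nn

class AdaptiveFFN(nn.Module):
    def __init__(self, d_model: int, d_small: int, d_big: int):
        super().__init__()
        self.router = nn.Linear(d_model, 3)  # logits for: bypass, small, big
        self.small = nn.Sequential(nn.Linear(d_model, d_small), nn.SiLU(),
                                   nn.Linear(d_small, d_model))
        self.big = nn.Sequential(nn.Linear(d_model, d_big), nn.SiLU(),
                                 nn.Linear(d_big, d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (tokens, d_model)
        choice = self.router(x).argmax(dim=-1)            # 0=bypass, 1=small, 2=big
        out = x.clone()                                   # bypass: token passes unchanged
        out[choice == 1] = self.small(x[choice == 1])
        out[choice == 2] = self.big(x[choice == 2])
        return out
```

An oracle, in the abstract's sense, would replace the trained `router` with the routing pattern that is known to minimize loss, which is what the authors use to expose the gap to trained routers.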
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09780">arXiv:2410.09780</a> <span> [<a href="https://arxiv.org/pdf/2410.09780">pdf</a>, <a href="https://arxiv.org/format/2410.09780">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Expanding Search Space with Diverse Prompting Agents: An Efficient Sampling Approach for LLM Mathematical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+G">Gisang Lee</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sangwoo Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+J">Junyoung Park</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+A">Andrew Chung</a>, <a href="/search/cs?searchtype=author&query=Park%2C+S">Sieun Park</a>, <a href="/search/cs?searchtype=author&query=Park%2C+Y">Yoonah Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+B">Byungju Kim</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Min-gyu Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09780v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have exhibited remarkable capabilities in many complex tasks including mathematical reasoning. However, traditional approaches heavily rely on ensuring self-consistency within single prompting method, which limits the exploration of diverse problem-solving strategies. This study addresses these limitations by performing an experimental analysis of distinct prompting me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09780v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09780v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09780v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have exhibited remarkable capabilities in many complex tasks including mathematical reasoning. However, traditional approaches heavily rely on ensuring self-consistency within single prompting method, which limits the exploration of diverse problem-solving strategies. This study addresses these limitations by performing an experimental analysis of distinct prompting methods within the domain of mathematical reasoning. Our findings demonstrate that each method explores a distinct search space, and this differentiation becomes more evident with increasing problem complexity. To leverage this phenomenon, we applied efficient sampling process that uniformly combines samples from these diverse methods, which not only expands the maximum search space but achieves higher performance with fewer runs compared to single methods. Especially, within the subset of difficult questions of MATH dataset named MATH-hard, The maximum search space was achieved while utilizing approximately 43% fewer runs than single methods on average. These findings highlight the importance of integrating diverse problem-solving strategies to enhance the reasoning abilities of LLMs. 

arXiv:2409.14805 (https://arxiv.org/abs/2409.14805) [pdf, other] cs.LG, cs.CR
SDBA: A Stealthy and Long-Lasting Durable Backdoor Attack in Federated Learning
Authors: Minyeong Choe, Cheolhee Park, Changho Seo, Hyunil Kim
Abstract: Federated Learning is a promising approach for training machine learning models while preserving data privacy, but its distributed nature makes it vulnerable to backdoor attacks, particularly in NLP tasks, where related research remains limited. This paper introduces SDBA, a novel backdoor attack mechanism designed for NLP tasks in FL environments. Our systematic analysis across LSTM and GPT-2 models identifies the layers most vulnerable to backdoor injection and achieves both stealth and long-lasting durability through layer-wise gradient masking and top-k% gradient masking within these layers. Experiments on next-token prediction and sentiment analysis tasks show that SDBA outperforms existing backdoors in durability and effectively bypasses representative defense mechanisms, with notable performance on LLMs such as GPT-2. These results underscore the need for robust defense strategies in NLP-based FL systems.
Submitted 23 September, 2024; originally announced September 2024.
Comments: 13 pages, 13 figures. This work has been submitted to the IEEE for possible publication.
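The top-k% gradient masking that the SDBA abstract names can be illustrated in a few lines; the percentage and the choice of layers are assumptions, and this is a sketch of the masking primitive for study purposes, not the attack pipeline:

```python
import torch

def topk_gradient_mask(grad: torch.Tensor, k_percent: float) -> torch.Tensor:
    """Keep only the top-k% largest-magnitude entries of a gradient tensor.

    Illustrates how an update can be confined to the coordinates that matter
    most, making it harder to distinguish from benign updates (the value of
    k_percent and which layers to mask are illustrative assumptions).
    """
    flat = grad.abs().flatten()
    k = max(1, int(flat.numel() * k_percent / 100.0))
    threshold = flat.topk(k).values.min()
    return grad * (grad.abs() >= threshold)
```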

arXiv:2409.12903 (https://arxiv.org/abs/2409.12903) [pdf, other] cs.CL, cs.AI, cs.LG
Scaling Smart: Accelerating Large Language Model Pre-training with Small Model Initialization
Authors: Mohammad Samragh, Iman Mirzadeh, Keivan Alizadeh Vahid, Fartash Faghri, Minsik Cho, Moin Nabi, Devang Naik, Mehrdad Farajtabar
Abstract: The pre-training phase of language models often begins with randomly initialized parameters. With the current trends in scaling models, training their large number of parameters can be extremely slow and costly. In contrast, small language models are less expensive to train, but they often cannot achieve the accuracy of large models. In this paper, we explore an intriguing idea that connects these two different regimes: can we develop a method to initialize large language models using smaller pre-trained models, and will such initialization bring any benefits in terms of training time and final accuracy? We introduce HyperCloning, a method that can expand the parameters of a pre-trained language model to those of a larger model with increased hidden dimensions. Our method ensures that the larger model retains the functionality of the smaller model, so the larger model inherits the predictive power and accuracy of the smaller model before training starts. We demonstrate that training such an initialized model yields significant savings in the GPU hours required for pre-training large language models.
Submitted 20 September, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
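The function-preserving expansion that the HyperCloning abstract describes can be sketched for a single linear layer: duplicating the weight blocks and halving them makes the doubled-width layer reproduce the original output on duplicated features. This is a generic construction in the spirit of the abstract, not the paper's exact recipe:

```python
import numpy as np

def clone_linear(W: np.ndarray, b: np.ndarray):
    """Expand a (d_out, d_in) linear map to (2*d_out, 2*d_in) so that, when
    every hidden feature is duplicated, the wider layer computes a duplicated
    copy of the original output -- a function-preserving initialization."""
    W2 = np.block([[W, W], [W, W]]) / 2.0   # halve so duplicated inputs don't double the sum
    b2 = np.concatenate([b, b])
    return W2, b2

# sanity check: duplicated input -> duplicated original output
W, b = np.random.randn(4, 3), np.random.randn(4)
x = np.random.randn(3)
W2, b2 = clone_linear(W, b)
assert np.allclose(W2 @ np.concatenate([x, x]) + b2, np.tile(W @ x + b, 2))
```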

arXiv:2409.09067 (https://arxiv.org/abs/2409.09067) [pdf, other] eess.AS, cs.LG, cs.SD, eess.SP
SLiCK: Exploiting Subsequences for Length-Constrained Keyword Spotting
Authors: Kumari Nishu, Minsik Cho, Devang Naik
Abstract: User-defined keyword spotting on a resource-constrained edge device is challenging. However, keywords are often bounded by a maximum keyword length, which has been largely under-leveraged in prior work. Our analysis of keyword-length distribution shows that user-defined keyword spotting can be treated as a length-constrained problem, eliminating the need for aggregation over variable text lengths. This leads to our proposed method for efficient keyword spotting, SLiCK (exploiting Subsequences for Length-Constrained Keyword spotting). We further introduce a subsequence-level matching scheme to learn audio-text relations at a finer granularity, distinguishing similar-sounding keywords more effectively through enhanced context. In SLiCK, the model is trained with a multi-task learning approach using two modules: a Matcher (an utterance-level matching task and a novel subsequence-level matching task) and an Encoder (a phoneme recognition task). The proposed method improves the baseline results on the LibriPhrase hard dataset, increasing AUC from 88.52 to 94.9 and reducing EER from 18.82 to 11.1.
Submitted 5 September, 2024; originally announced September 2024.
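A toy version of subsequence-level matching as the SLiCK abstract describes it: score every contiguous phoneme subsequence of the keyword against the utterance embedding and keep the best match. The embeddings, mean-pooling, and cosine scoring are illustrative assumptions, not the paper's architecture:

```python
import numpy as np

def subsequence_match_score(audio_emb: np.ndarray, phoneme_embs: np.ndarray, min_len: int = 2) -> float:
    """Score a keyword against audio by matching every contiguous phoneme
    subsequence (mean-pooled) to the utterance embedding, taking the max."""
    assert len(phoneme_embs) >= min_len, "keyword must have at least min_len phonemes"

    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

    n = len(phoneme_embs)
    scores = [cos(audio_emb, phoneme_embs[i:j].mean(axis=0))
              for i in range(n) for j in range(i + min_len, n + 1)]
    return max(scores)
```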

arXiv:2409.03899 (https://arxiv.org/abs/2409.03899) [pdf, other] cs.RO, cs.CR. DOI: 10.1145/3689936.3694694 (https://doi.org/10.1145/3689936.3694694)
Achieving the Safety and Security of the End-to-End AV Pipeline
Authors: Noah T. Curran, Minkyoung Cho, Ryan Feng, Liangkai Liu, Brian Jay Tang, Pedram MohajerAnsari, Alkim Domeke, Mert D. Pesé, Kang G. Shin
Abstract: In the current landscape of autonomous vehicle (AV) safety and security research, multiple isolated problems are being tackled by the community at large. Due to the lack of common evaluation criteria, several important research questions are at odds with one another. For instance, while much research has been conducted on physical attacks that deceive AV perception systems, there is often inadequate investigation of working defenses and of the downstream effects on safe vehicle control. This paper provides a thorough description of the current state of AV safety and security research. We provide individual sections for the primary research questions that concern this research area, including AV surveillance, sensor system reliability, security of the AV stack, algorithmic robustness, and safe environment interaction. We wrap up the paper with a discussion of the issues that concern the interactions of these separate problems. At the conclusion of each section, we propose future research questions that still lack conclusive answers. This position article will serve as an entry point for novice and veteran researchers seeking to partake in this research domain.
Submitted 5 September, 2024; originally announced September 2024.
Comments: Accepted to the 1st Cyber Security in Cars Workshop (CSCS) at CCS

arXiv:2409.02838 (https://arxiv.org/abs/2409.02838) [pdf, other] cs.CV
iConFormer: Dynamic Parameter-Efficient Tuning with Input-Conditioned Adaptation
Authors: Hayeon Jo, Hyesong Choi, Minhee Cho, Dongbo Min
Abstract: Transfer learning based on full fine-tuning (FFT) of the pre-trained encoder and task-specific decoder becomes increasingly complex as deep models grow exponentially. Parameter-efficient fine-tuning (PEFT) approaches using adapters consisting of small learnable layers have emerged as an alternative to FFT, achieving comparable performance while maintaining high training efficiency. However, the inflexibility of the adapter with respect to input instances limits its capability to learn task-specific information in diverse downstream tasks. In this paper, we propose a novel PEFT approach, input-Conditioned transFormer, termed iConFormer, that leverages a dynamic adapter conditioned on the input instances. To secure flexible learning ability on input instances in various downstream tasks, we introduce an input-Conditioned Network (iCoN) in the dynamic adapter that enables instance-level feature transformation. Specifically, iCoN generates a channel-wise convolutional kernel for each feature and transforms it using an adaptive convolution process to effectively capture task-specific, fine-grained details tailored to downstream tasks. Experimental results demonstrate that by tuning just 1.6% to 2.8% of the Transformer backbone parameters, iConFormer achieves performance comparable to FFT in monocular depth estimation and semantic segmentation, while outperforming it in image classification and instance segmentation. The proposed method also consistently outperforms recent PEFT methods on all the tasks mentioned above.
Submitted 4 September, 2024; originally announced September 2024.
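A minimal sketch of an input-conditioned adapter in the spirit of iCoN: a small head generates a depthwise convolutional kernel per instance and channel, which is then applied to that instance's features. The sizes, the global-average context, and the residual placement are assumptions, not the paper's exact design:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class InputConditionedAdapter(nn.Module):
    """Generate a depthwise conv kernel per input instance and apply it."""
    def __init__(self, channels: int, k: int = 3):
        super().__init__()
        self.channels, self.k = channels, k
        self.kernel_head = nn.Linear(channels, channels * k * k)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, h, w = x.shape
        ctx = x.mean(dim=(2, 3))                                   # (B, C) global context
        kernels = self.kernel_head(ctx).view(b * c, 1, self.k, self.k)
        # fold batch into channels so each sample gets its own kernels
        out = F.conv2d(x.reshape(1, b * c, h, w), kernels,
                       padding=self.k // 2, groups=b * c)
        return x + out.view(b, c, h, w)                            # residual adapter
```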

arXiv:2409.02699 (https://arxiv.org/abs/2409.02699) [pdf, other] cs.CV
CLDA: Collaborative Learning for Enhanced Unsupervised Domain Adaptation
Authors: Minhee Cho, Hyesong Choi, Hayeon Jo, Dongbo Min
Abstract: Unsupervised Domain Adaptation (UDA) endeavors to bridge the gap between a model trained on a labeled source domain and its deployment in an unlabeled target domain. However, current high-performance models demand significant resources, resulting in prohibitive deployment costs and highlighting the need for small yet effective models. For UDA of lightweight models, Knowledge Distillation (KD) in a teacher-student framework can be a common approach, but we find that domain shift in UDA leads to a significant increase in non-salient parameters in the teacher model, degrading the model's generalization ability and transferring misleading information to the student model. Interestingly, we observed that this phenomenon occurs considerably less in the student model. Driven by this insight, we introduce Collaborative Learning, a method that updates the teacher's non-salient parameters using the student model and, at the same time, enhances the student's performance using the updated teacher model. Experiments across various tasks and datasets show consistent performance improvements for both student and teacher models. For example, in semantic segmentation, CLDA achieves improvements of +0.7% mIoU for the teacher and +1.4% mIoU for the student over the baseline model on GTA to Cityscapes, and of +0.8% mIoU for the teacher and +2.0% mIoU for the student on Synthia to Cityscapes.
Submitted 4 September, 2024; originally announced September 2024.
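The collaborative update described above might look roughly as follows, assuming (for simplicity, and unlike a real KD setup with differently sized networks) architecturally matched teacher and student, and using weight magnitude as a stand-in saliency measure; both choices are assumptions:

```python
import torch

@torch.no_grad()
def collaborative_update(teacher, student, saliency_threshold: float = 1e-3, ema: float = 0.999):
    """Overwrite the teacher's non-salient parameters (here: small-magnitude
    weights, an illustrative saliency proxy) with the student's values, while
    salient teacher weights track the student slowly via EMA."""
    for pt, ps in zip(teacher.parameters(), student.parameters()):
        non_salient = pt.abs() < saliency_threshold
        pt[non_salient] = ps[non_salient]                 # student repairs degraded weights
        pt[~non_salient] = ema * pt[~non_salient] + (1 - ema) * ps[~non_salient]
```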

arXiv:2408.12004 (https://arxiv.org/abs/2408.12004) [pdf, other] cs.LG, stat.ME, stat.ML
CSPI-MT: Calibrated Safe Policy Improvement with Multiple Testing for Threshold Policies
Authors: Brian M Cho, Ana-Roxana Pop, Kyra Gan, Sam Corbett-Davies, Israel Nir, Ariel Evnine, Nathan Kallus
Abstract: When modifying existing policies in high-risk settings, it is often necessary to ensure with high certainty that the newly proposed policy improves upon a baseline, such as the status quo. In this work, we consider the problem of safe policy improvement, where one adopts a new policy only if it is deemed better than the specified baseline with at least a pre-specified probability. We focus on threshold policies, a ubiquitous class of policies with applications in economics, healthcare, and digital advertising. Existing methods rely on potentially underpowered safety checks and limit the opportunities for finding safe improvements, so they must too often revert to the baseline to maintain safety. We overcome these issues by leveraging the most powerful safety test in the asymptotic regime and by allowing multiple candidates to be tested for improvement over the baseline. We show that in adversarial settings, our approach controls the rate of adopting a policy worse than the baseline at the pre-specified error level, even at moderate sample sizes. We present CSPI and CSPI-MT, two novel heuristics for selecting cutoff(s) to maximize the policy improvement over the baseline. We demonstrate through both synthetic and external datasets that our approaches improve both the detection rate of safe policies and the realized improvement, particularly under stringent safety requirements and low signal-to-noise conditions.
Submitted 21 August, 2024; originally announced August 2024.
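The core safety gate in safe policy improvement can be sketched as a one-sided test: adopt a candidate only if the lower confidence bound of its estimated improvement stays above zero, with a Bonferroni split when several candidate cutoffs are tested. The paper's calibrated test is more powerful; this is only the baseline idea:

```python
import numpy as np
from scipy import stats

def adopt_if_safe(improvements: np.ndarray, alpha: float = 0.05, n_candidates: int = 1) -> bool:
    """One-sided safety check on per-unit improvement estimates over baseline.

    improvements: array of estimated per-unit gains of the candidate policy.
    With several candidate thresholds, alpha is Bonferroni-split (a simple
    multiple-testing correction; the paper's procedure may differ).
    """
    a = alpha / n_candidates
    mean = improvements.mean()
    se = improvements.std(ddof=1) / np.sqrt(len(improvements))
    lcb = mean - stats.norm.ppf(1 - a) * se     # asymptotic normal lower bound
    return lcb > 0
```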

arXiv:2408.11155 (https://arxiv.org/abs/2408.11155) [pdf, other] cs.RO
Range-based Multi-Robot Integrity Monitoring Against Cyberattacks and Faults: An Anchor-Free Approach
Authors: Vishnu Vijay, Kartik A. Pant, Minhyun Cho, Yifan Guo, James M. Goppert, Inseok Hwang
Abstract: Coordination of multi-robot systems (MRSs) relies on efficient sensing and reliable communication among the robots. However, the sensors and communication channels of these robots are often vulnerable to cyberattacks and faults, which can disrupt their individual behavior and the overall objective of the MRS. In this work, we present a multi-robot integrity monitoring framework that utilizes inter-robot range measurements to (i) detect the presence of cyberattacks or faults affecting the MRS, (ii) identify the affected robot(s), and (iii) reconstruct the resulting localization error of these robot(s). The proposed iterative algorithm leverages sequential convex programming and the alternating direction method of multipliers to enable real-time and distributed implementation. Our approach is validated using numerical simulations and demonstrated using PX4-SiTL in Gazebo on an MRS where certain agents deviate from their desired positions due to a GNSS spoofing attack. Furthermore, we demonstrate the scalability and interoperability of our algorithm through mixed-reality experiments by forming a heterogeneous MRS comprising real Crazyflie UAVs and virtual PX4-SiTL UAVs working in tandem.
Submitted 20 August, 2024; originally announced August 2024.
Comments: 8 pages, 7 figures
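A much-simplified version of range-based integrity checking: compare measured inter-robot ranges against the ranges implied by reported positions, and flag the robot with the largest inconsistency. The paper's SCP/ADMM formulation goes far beyond this residual heuristic, which is shown only to fix the intuition:

```python
import numpy as np

def flag_suspect_robot(positions: np.ndarray, ranges: np.ndarray, tol: float = 0.5) -> int:
    """Return the index of the robot least consistent with measured ranges,
    or -1 if everything is within tolerance.

    positions: (n, 2 or 3) reported robot positions
    ranges:    (n, n) measured inter-robot distances
    """
    expected = np.linalg.norm(positions[:, None, :] - positions[None, :, :], axis=-1)
    residual = np.abs(ranges - expected)
    np.fill_diagonal(residual, 0.0)
    per_robot = residual.sum(axis=1)            # total inconsistency per robot
    return int(per_robot.argmax()) if per_robot.max() > tol else -1
```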

arXiv:2408.05917 (https://arxiv.org/abs/2408.05917) [pdf] cs.CE, cs.AI, cs.LG
Inverse design of Non-parameterized Ventilated Acoustic Resonator via Variational Autoencoder with Acoustic Response-encoded Latent Space
Authors: Min Woo Cho, Seok Hyeon Hwang, Jun-Young Jang, Jin Yeong Song, Sun-kwang Hwang, Kyoung Je Cha, Dong Yong Park, Kyungjun Song, Sang Min Park
Abstract: The ventilated acoustic resonator (VAR), a type of acoustic metamaterial, has emerged as an alternative for sound attenuation in environments that require ventilation, owing to its excellent low-frequency attenuation performance and flexible shape adaptability. However, due to the non-linear acoustic responses of VARs, VAR designs are generally obtained within a limited parameterized design space, and the design relies on iterations of numerical simulation, which consume considerable computational time and resources. This paper proposes the acoustic response-encoded variational autoencoder (AR-VAE), a novel variational-autoencoder-based generative design model for the efficient and accurate inverse design of VARs, even with non-parameterized designs. The AR-VAE matches the high-dimensional acoustic response with the VAR cross-section image in the dimension-reduced latent space, which enables the AR-VAE to generate various non-parameterized VAR cross-section images with the target acoustic response. AR-VAE generates non-parameterized VARs from target acoustic responses with a 25-fold reduction in mean squared error compared to conventional deep-learning-based parameter-searching methods, while exhibiting lower average mean squared error and peak-frequency variance. By combining inverse-designed VARs from the AR-VAE, a multi-cavity VAR was devised for broadband and multi-target peak-frequency attenuation. The proposed design method presents a new approach for structural inverse design with a high-dimensional non-linear physical response.
Submitted 12 August, 2024; originally announced August 2024.
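The latent-space matching the AR-VAE abstract describes can be sketched as a VAE whose image encoder and acoustic-response encoder share one latent space, so that at inference a target response alone is decoded into a design. Layer sizes, losses, and the MLP architecture here are placeholders, not the paper's model:

```python
import torch
import torch.nn as nn

class ARVAE(nn.Module):
    """Minimal sketch of an acoustic-response-encoded VAE."""
    def __init__(self, img_dim=32 * 32, resp_dim=200, z_dim=16):
        super().__init__()
        self.img_enc = nn.Sequential(nn.Linear(img_dim, 256), nn.ReLU(), nn.Linear(256, 2 * z_dim))
        self.resp_enc = nn.Sequential(nn.Linear(resp_dim, 128), nn.ReLU(), nn.Linear(128, z_dim))
        self.dec = nn.Sequential(nn.Linear(z_dim, 256), nn.ReLU(), nn.Linear(256, img_dim))

    def forward(self, img, resp):
        mu, logvar = self.img_enc(img).chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()   # reparameterization trick
        recon = self.dec(z)
        z_resp = self.resp_enc(resp)                           # latent predicted from response
        # training would minimize: reconstruction + KL + ||mu - z_resp||^2 (latent matching)
        return recon, mu, logvar, z_resp

    def inverse_design(self, target_resp):
        # a design generated from the target response alone
        return self.dec(self.resp_enc(target_resp))
```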

arXiv:2408.04961 (https://arxiv.org/abs/2408.04961) [pdf, other] cs.CV
In Defense of Lazy Visual Grounding for Open-Vocabulary Semantic Segmentation
Authors: Dahyun Kang, Minsu Cho
Abstract: We present lazy visual grounding, a two-stage approach of unsupervised object mask discovery followed by object grounding, for open-vocabulary semantic segmentation. Much previous art casts this task as pixel-to-text classification without object-level comprehension, leveraging the image-to-text classification capability of pretrained vision-and-language models. We argue that visual objects are distinguishable without prior text information, as segmentation is essentially a vision task. Lazy visual grounding first discovers object masks covering an image with iterative Normalized Cuts and then assigns text to the discovered objects in a late-interaction manner. Our model requires no additional training yet shows great performance on five public datasets: Pascal VOC, Pascal Context, COCO-object, COCO-stuff, and ADE 20K. Especially, the visually appealing segmentation results demonstrate the model's capability to localize objects precisely. Paper homepage: https://cvlab.postech.ac.kr/research/lazygrounding
Submitted 9 August, 2024; originally announced August 2024.
Comments: Accepted to ECCV 2024
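The late-interaction grounding step (the second stage above) reduces to a similarity lookup once masks and features exist; mask discovery via iterative Normalized Cuts is omitted here, and CLIP-like L2-normalized features are an assumption:

```python
import numpy as np

def assign_labels_late(mask_feats: np.ndarray, text_feats: np.ndarray) -> np.ndarray:
    """Late-interaction grounding: each discovered object mask (already pooled
    to one feature vector) is assigned the class whose text embedding it is
    most similar to.

    mask_feats: (n_masks, d), text_feats: (n_classes, d), both L2-normalized.
    """
    sim = mask_feats @ text_feats.T     # (n_masks, n_classes) cosine similarities
    return sim.argmax(axis=1)           # class index per mask
```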

arXiv:2408.02957 (https://arxiv.org/abs/2408.02957) [pdf, other] cs.CV
Online Temporal Action Localization with Memory-Augmented Transformer
Authors: Youngkil Song, Dongkeun Kim, Minsu Cho, Suha Kwak
Abstract: Online temporal action localization (On-TAL) is the task of identifying multiple action instances given a streaming video. Since existing methods take as input only a video segment of fixed size per iteration, they are limited in considering long-term context and require tuning the segment size carefully. To overcome these limitations, we propose the memory-augmented transformer (MATR). MATR utilizes a memory queue that selectively preserves past segment features, allowing it to leverage long-term context for inference. We also propose a novel action localization method that observes the current input segment to predict the end time of the ongoing action and accesses the memory queue to estimate the start time of the action. Our method outperformed existing methods on two datasets, THUMOS14 and MUSES, surpassing not only TAL methods in the online setting but also some offline TAL methods.
Submitted 6 August, 2024; originally announced August 2024.
Comments: Accepted to ECCV 2024. Project page: https://cvlab.postech.ac.kr/research/MATR/
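The memory queue at the heart of MATR can be sketched as a bounded store of past segment features whose contents are concatenated into long-term context; the plain FIFO eviction below is a simplification of MATR's selective preservation:

```python
from collections import deque
import torch

class SegmentMemory:
    """Bounded memory queue of past video-segment features."""
    def __init__(self, max_segments: int = 16):
        self.queue = deque(maxlen=max_segments)   # oldest segment evicted first

    def add(self, segment_feat: torch.Tensor):
        # segment_feat: (seq_len, dim) features of one processed segment
        self.queue.append(segment_feat)

    def context(self) -> torch.Tensor:
        # long-term context for the transformer: concatenation of stored segments
        assert self.queue, "memory is empty"
        return torch.cat(list(self.queue), dim=0)
```

At each iteration the model would attend over `context()` to estimate the start time of an ongoing action, while the current segment supplies the end-time prediction.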

arXiv:2407.21075 (https://arxiv.org/abs/2407.21075) [pdf, other] cs.AI, cs.CL, cs.LG
Apple Intelligence Foundation Language Models
Authors: Tom Gunter, Zirui Wang, Chong Wang, Ruoming Pang, Andy Narayanan, Aonan Zhang, Bowen Zhang, Chen Chen, Chung-Cheng Chiu, David Qiu, Deepak Gopinath, Dian Ang Yap, Dong Yin, Feng Nan, Floris Weers, Guoli Yin, Haoshuo Huang, Jianyu Wang, Jiarui Lu, John Peebles, Ke Ye,
Mark Lee, Nan Du, Qibin Chen, Quentin Keunebroek, et al. (130 additional authors not shown)
Abstract: We present foundation language models developed to power Apple Intelligence features, including a ~3 billion parameter model designed to run efficiently on devices and a large server-based language model designed for Private Cloud Compute. These models are designed to perform a wide range of tasks efficiently, accurately, and responsibly. This report describes the model architecture, the data used to train the model, the training process, how the models are optimized for inference, and the evaluation results. We highlight our focus on Responsible AI and how the principles are applied throughout the model development.
Submitted 29 July, 2024; originally announced July 2024.

arXiv:2407.19698 (https://arxiv.org/abs/2407.19698) [pdf, other] cs.CV
Classification Matters: Improving Video Action Detection with Class-Specific Attention
Authors: Jinsung Lee, Taeoh Kim, Inwoong Lee, Minho Shim, Dongyoon Wee, Minsu Cho, Suha Kwak
Abstract: Video action detection (VAD) aims to detect actors and classify their actions in a video. We find that VAD suffers more from classification than from localization of actors. Hence, we analyze how prevailing methods form features for classification and observe that they prioritize actor regions, often overlooking the contextual information necessary for accurate classification. Accordingly, we propose to reduce the bias toward the actor and to encourage paying attention to the context that is relevant to each action class. By assigning a class-dedicated query to each action class, our model can dynamically determine where to focus for effective classification. The proposed model demonstrates superior performance on three challenging benchmarks with significantly fewer parameters and less computation.
Submitted 11 September, 2024; v1 submitted 29 July, 2024; originally announced July 2024.
Comments: 31 pages, accepted to ECCV 2024 (oral)
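The class-dedicated-query idea can be sketched directly with standard cross-attention: one learnable query per action class attends over the video features, so each class gathers its own relevant context. Dimensions and the single-linear classifier head are illustrative assumptions:

```python
import torch
import torch.nn as nn

class ClassQueryHead(nn.Module):
    """One learnable query per action class cross-attends over video features."""
    def __init__(self, num_classes: int, dim: int, heads: int = 4):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_classes, dim))
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.classifier = nn.Linear(dim, 1)

    def forward(self, video_feats: torch.Tensor) -> torch.Tensor:
        # video_feats: (batch, tokens, dim) -> per-class logits (batch, num_classes)
        q = self.queries.unsqueeze(0).expand(video_feats.size(0), -1, -1)
        ctx, _ = self.attn(q, video_feats, video_feats)   # class-specific context
        return self.classifier(ctx).squeeze(-1)
```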

arXiv:2407.14057 (https://arxiv.org/abs/2407.14057) [pdf, other] cs.CL, cs.AI, cs.LG
LazyLLM: Dynamic Token Pruning for Efficient Long Context LLM Inference
Authors: Qichen Fu, Minsik Cho, Thomas Merth, Sachin Mehta, Mohammad Rastegari, Mahyar Najibi
Abstract: The inference of transformer-based large language models consists of two sequential stages: 1) a prefilling stage to compute the KV cache of the prompt and generate the first token, and 2) a decoding stage to generate subsequent tokens. For long prompts, the KV cache must be computed for all tokens during the prefilling stage, which can significantly increase the time needed to generate the first token. Consequently, the prefilling stage may become a bottleneck in the generation process. An open question is whether all prompt tokens are essential for generating the first token. To answer this, we introduce a novel method, LazyLLM, that selectively computes the KV for tokens important for the next-token prediction in both the prefilling and decoding stages. In contrast to static pruning approaches that prune the prompt all at once, LazyLLM allows the model to dynamically select different subsets of tokens from the context at different generation steps, even if they were pruned at previous steps. Extensive experiments on standard datasets across various tasks demonstrate that LazyLLM is a generic method that can be seamlessly integrated with existing language models to significantly accelerate generation without fine-tuning. For instance, in the multi-document question-answering task, LazyLLM accelerates the prefilling stage of the Llama 2 7B model by 2.34x while maintaining accuracy.
Submitted 19 July, 2024; originally announced July 2024.
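A sketch of attention-guided token selection in the spirit of LazyLLM: rank prompt tokens by attention mass and keep only the top fraction for KV computation at the current step. The exact importance criterion and the pruning schedule in the paper may differ; the input shape here is an assumption:

```python
import torch

def select_important_tokens(attn_scores: torch.Tensor, keep_ratio: float) -> torch.Tensor:
    """Pick the prompt tokens with the highest attention mass; only their KV
    entries would be computed at this step.

    attn_scores: (heads, prompt_len) attention from the current position,
    averaged over heads below (an illustrative importance measure).
    """
    importance = attn_scores.mean(dim=0)                 # (prompt_len,)
    k = max(1, int(importance.numel() * keep_ratio))
    keep = importance.topk(k).indices
    return keep.sort().values                            # preserve original token order
```

Because the selection is recomputed at every generation step, a token pruned early can re-enter the kept set later, which is the dynamic behavior the abstract contrasts with static pruning.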

arXiv:2407.13006 (https://arxiv.org/abs/2407.13006) [pdf, other] cs.LG, cs.AI
Sparsity-based Safety Conservatism for Constrained Offline Reinforcement Learning
Authors: Minjae Cho, Chuangchuang Sun
Abstract: Reinforcement learning (RL) has seen notable success in decision-making fields like autonomous driving and robotic manipulation. Yet its reliance on real-time feedback poses challenges in costly or hazardous settings. Furthermore, RL's training approach, centered on "on-policy" sampling, does not fully capitalize on data. Hence, offline RL has emerged as a compelling alternative, particularly when conducting additional experiments is impractical and abundant datasets are available. However, the challenge of distributional shift (extrapolation), i.e., the disparity between the data distribution and the learned policy, also poses a risk in offline RL, potentially leading to significant safety breaches due to estimation errors (interpolation). This concern is particularly pronounced in safety-critical domains, where real-world problems are prevalent. To address both extrapolation and interpolation errors, numerous studies have introduced additional constraints to confine policy behavior, steering it toward more cautious decision-making. While many studies have addressed extrapolation errors, fewer have focused on providing effective solutions for tackling interpolation errors. For example, some works tackle this issue by incorporating potential cost-maximizing optimization that perturbs the original dataset; however, this involves a bi-level optimization structure that may introduce significant instability or complicate problem-solving in high-dimensional tasks. This motivates us to pinpoint areas where hazards may be more prevalent than initially estimated, based on the sparsity of available data, thereby providing significant insight into constrained offline RL. In this paper, we present conservative metrics based on data sparsity that demonstrate high generalizability to any method and efficacy compared to using bi-level cost-upper-bound maximization.
Submitted 17 July, 2024; originally announced July 2024.
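One simple way to turn data sparsity into a conservatism signal, as the abstract suggests, is a k-nearest-neighbor distance around a query state-action pair: sparse regions get a large penalty. The kNN proxy and the flat feature representation are assumptions, not necessarily the paper's metric:

```python
import numpy as np

def sparsity_penalty(query: np.ndarray, dataset: np.ndarray, k: int = 10) -> float:
    """Mean distance to the k nearest dataset points around a state-action
    pair; larger values indicate sparser (riskier) regions, so a constrained
    objective would add a bigger safety margin there.

    query: (d,) feature vector; dataset: (n, d) offline samples.
    """
    dists = np.linalg.norm(dataset - query, axis=1)
    return float(np.sort(dists)[:k].mean())
```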
arXiv:2407.11245 [pdf, other] (https://arxiv.org/abs/2407.11245)
Subjects: cs.IR (Information Retrieval); cs.AI (Artificial Intelligence)
Title: Pacer and Runner: Cooperative Learning Framework between Single- and Cross-Domain Sequential Recommendation
Authors: Chung Park, Taesan Kim, Hyungjun Yoon, Junui Hong, Yelim Yu, Mincheol Cho, Minsung Choi, Jaegul Choo
Abstract: Cross-Domain Sequential Recommendation (CDSR) improves recommendation performance by utilizing information from multiple domains, in contrast with Single-Domain Sequential Recommendation (SDSR), which relies on historical interactions within a specific domain.
However, CDSR may underperform the SDSR approach in certain domains due to negative transfer, which occurs when there is a lack of relation between domains or different levels of data sparsity. To address the issue of negative transfer, our proposed CDSR model estimates the degree of negative transfer of each domain and adaptively assigns it as a weight factor to the prediction loss, to control gradient flow through domains with significant negative transfer. To this end, our model compares the performance of a model trained on multiple domains (CDSR) with a model trained solely on the specific domain (SDSR) to evaluate the negative transfer of each domain, using our asymmetric cooperative network. In addition, to facilitate the transfer of valuable cues between the SDSR and CDSR tasks, we developed an auxiliary loss that maximizes the mutual information between the representation pairs from both tasks on a per-domain basis. This cooperative learning between the SDSR and CDSR tasks is similar to the collaborative dynamics between pacers and runners in a marathon. Our model outperformed numerous previous works in extensive experiments on two real-world industrial datasets across ten service domains. We have also deployed our model in the recommendation system of our personal assistant app service, resulting in a 21.4% increase in click-through rate compared to existing models, which is valuable to real-world business.
Submitted 24 July, 2024; v1 submitted 15 July, 2024; originally announced July 2024.
Comments: Accepted at SIGIR'24 (Best Paper Honorable Mention)
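As an illustration of the weighting idea (a sketch under assumed details, not the authors' exact estimator), one can turn a per-domain negative-transfer estimate, here the gap between SDSR and CDSR validation performance, into a loss weight that shrinks the gradient contribution of domains hurt by cross-domain training; the temperature `tau` is an invented knob for the example.

    import numpy as np

    def negative_transfer_weights(sdsr_perf, cdsr_perf, tau=1.0):
        # Larger SDSR-over-CDSR gap = more negative transfer = smaller weight.
        nt = np.maximum(0.0, np.asarray(sdsr_perf) - np.asarray(cdsr_perf))
        return np.exp(-nt / tau)

    sdsr = [0.42, 0.30, 0.55]   # per-domain validation metric, single-domain model
    cdsr = [0.40, 0.33, 0.45]   # same metric for the cross-domain model
    w = negative_transfer_weights(sdsr, cdsr)
    per_domain_loss = np.array([1.2, 0.8, 1.5])
    print(w.round(3), float((w * per_domain_loss).sum()))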
arXiv:2407.10542 [pdf, other] (https://arxiv.org/abs/2407.10542)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: 3D Geometric Shape Assembly via Efficient Point Cloud Matching
Authors: Nahyuk Lee, Juhong Min, Junha Lee, Seungwook Kim, Kanghee Lee, Jaesik Park, Minsu Cho
Abstract: Learning to assemble geometric shapes into a larger target structure is a pivotal task in various practical applications. In this work, we tackle this problem by establishing local correspondences between point clouds of part shapes at both coarse and fine levels. To this end, we introduce Proxy Match Transform (PMT), an approximate high-order feature transform layer that enables reliable matching between mating surfaces of parts while incurring low costs in memory and computation. Building upon PMT, we introduce a new framework, dubbed Proxy Match TransformeR (PMTR), for the geometric assembly task. We evaluate the proposed PMTR on the large-scale 3D geometric shape assembly benchmark dataset Breaking Bad and demonstrate its superior performance and efficiency compared to state-of-the-art methods. Project page: https://nahyuklee.github.io/pmtr.
Submitted 15 July, 2024; originally announced July 2024.
Comments: Accepted to ICML 2024
arXiv:2407.01158 [pdf, other] (https://arxiv.org/abs/2407.01158)
Subjects: cs.CL (Computation and Language)
Title: Learning to Explore and Select for Coverage-Conditioned Retrieval-Augmented Generation
Authors: Takyoung Kim, Kyungjae Lee, Young Rok Jang, Ji Yong Cho, Gangwoo Kim, Minseok Cho, Moontae Lee
Abstract: Interactions with large language models (LLMs) often yield long and detailed responses, leveraging both parametric knowledge and retrieval-augmented generation (RAG). While these responses can provide rich insights, they often include redundant or less engaging content not aligned with user interests. This issue becomes apparent when users specify particular subtopics to include or exclude, termed coverage-conditioned ($C^2$) queries, as LLMs often struggle to provide tailored responses. To address this challenge, we investigate the role of query outlines, sequences of subqueries designed to guide LLMs in generating responses that meet specific user requirements.
To systematically create and evaluate these outlines, we introduce QTree, a dataset of 10K hierarchical sets of information-seeking subqueries that define structured boundaries for outline creation and evaluation in $C^2$ scenarios. Additionally, we develop QPlanner, a 7B language model trained to generate customized outlines within the boundaries of QTree. We evaluate the effectiveness of the generated outlines through automatic and human judgements, focusing on their impact within retrieval-augmented generation (RAG) systems. Experimental results demonstrate that QPlanner, especially when trained with alignment techniques like DPO, generates higher-quality outlines that better fulfill diverse user needs.
Submitted 24 January, 2025; v1 submitted 1 July, 2024; originally announced July 2024.
Comments: NAACL 2025 (Findings, Long). Resources are available at https://github.com/youngerous/qtree
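The DPO objective mentioned in this abstract is standard and compact enough to sketch. The snippet below shows the generic loss on sequence log-probabilities of a preferred versus a dispreferred outline; how QPlanner batches and scores outlines is the paper's detail and is not reproduced here.

    import torch
    import torch.nn.functional as F

    def dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected, beta=0.1):
        # Standard DPO: reward the policy for widening its margin over the
        # frozen reference model on preferred vs. dispreferred outputs.
        margin = beta * ((logp_chosen - ref_chosen) - (logp_rejected - ref_rejected))
        return -F.logsigmoid(margin).mean()

    # Toy sequence log-probabilities (sum of token log-probs per outline).
    lp_c, lp_r = torch.tensor([-12.0]), torch.tensor([-15.0])
    ref_c, ref_r = torch.tensor([-13.0]), torch.tensor([-14.0])
    print(dpo_loss(lp_c, lp_r, ref_c, ref_r))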
arXiv:2406.17869 [pdf, other] (https://arxiv.org/abs/2406.17869)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Burst Image Super-Resolution with Base Frame Selection
Authors: Sanghyun Kim, Min Jung Lee, Woohyeok Kim, Deunsol Jung, Jaesung Rim, Sunghyun Cho, Minsu Cho
Abstract: Burst image super-resolution has been a topic of active research in recent years due to its ability to obtain a high-resolution image by using complementary information between multiple frames in the burst. In this work, we explore using burst shots with non-uniform exposures to confront real-world practical scenarios by introducing a new benchmark dataset, dubbed Non-uniformly Exposed Burst Image (NEBI), that includes burst frames at varying exposure times to obtain a broader range of irradiance and motion characteristics within a scene. As burst shots with non-uniform exposures exhibit varying levels of degradation, fusing information of the burst shots into the first frame as a base frame may not result in optimal image quality. To address this limitation, we propose a Frame Selection Network (FSN) for non-uniform scenarios. This network seamlessly integrates into existing super-resolution methods in a plug-and-play manner with low computational costs. The comparative analysis reveals the effectiveness of the non-uniform setting for the practical scenario and of our FSN on synthetic/real NEBI datasets.
Submitted 25 June, 2024; originally announced June 2024.
Comments: CVPR2024W NTIRE accepted
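To make the base-frame idea concrete, here is a deliberately crude heuristic stand-in for a learned selector; FSN itself is a trained network, and the scoring function below is entirely illustrative: score each frame for gradient energy and exposure clipping, then take the best-scoring frame as the base.

    import numpy as np

    def pick_base_frame(frames):
        # Toy score: reward gradient energy (a rough sharpness proxy),
        # penalize clipped pixels; a learned FSN replaces this heuristic.
        scores = []
        for f in frames:
            clipped = np.mean((f < 0.02) | (f > 0.98))
            grad = np.mean(np.abs(np.diff(f, axis=0))) + np.mean(np.abs(np.diff(f, axis=1)))
            scores.append(grad - clipped)
        return int(np.argmax(scores))

    rng = np.random.default_rng(2)
    burst = [np.clip(rng.normal(0.5, s, (32, 32)), 0, 1) for s in (0.05, 0.4, 0.2)]
    print(pick_base_frame(burst))   # index of the preferred base frame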
arXiv:2405.05329 [pdf, other] (https://arxiv.org/abs/2405.05329)
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: KV-Runahead: Scalable Causal LLM Inference by Parallel Key-Value Cache Generation
Authors: Minsik Cho, Mohammad Rastegari, Devang Naik
Abstract: Large Language Model (LLM) inference has two phases: the prompt (or prefill) phase to output the first token, and the extension (or decoding) phase to generate subsequent tokens. In this work, we propose an efficient parallelization scheme, KV-Runahead, to accelerate the prompt phase. The key observation is that the extension phase generates tokens faster than the prompt phase because of the key-value cache (KV-cache). Hence, KV-Runahead parallelizes the prompt phase by orchestrating multiple processes to populate the KV-cache, minimizing the time-to-first-token (TTFT). Dual-purposing the KV-cache scheme has two main benefits. First, since the KV-cache is designed to leverage the causal attention map, we minimize computation and communication automatically. Second, since it already exists for the extension phase, KV-Runahead is easy to implement. We further propose context-level load-balancing to handle uneven KV-cache generation (due to the causal attention) and to optimize TTFT. Compared with existing parallelization schemes such as tensor or sequential parallelization, where keys and values are locally generated and exchanged via all-gather collectives, our experimental results demonstrate that KV-Runahead offers over 1.4x and 1.6x speedups for Llama 7B and Falcon 7B, respectively.
Submitted 13 May, 2024; v1 submitted 8 May, 2024; originally announced May 2024.
Comments: preprint for ICML 2024
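The context-level load balancing can be viewed as a partitioning problem: under causal attention, the cost of prefilling token i grows roughly linearly with i, so balanced work means equalizing cumulative cost rather than token counts. The partitioner below is a simplified illustration under that linear-cost assumption, not the paper's scheme (which also orchestrates KV exchange between processes).

    def causal_balanced_splits(n_tokens, n_workers):
        # Cost of token i ~ i (it attends to all earlier tokens), so cut the
        # prompt where cumulative cost crosses equal shares of the total.
        total = n_tokens * (n_tokens + 1) / 2
        bounds, acc, target, b = [0], 0.0, total / n_workers, 1
        for i in range(1, n_tokens + 1):
            acc += i
            if acc >= b * target and len(bounds) < n_workers:
                bounds.append(i)
                b += 1
        bounds.append(n_tokens)
        return list(zip(bounds[:-1], bounds[1:]))

    print(causal_balanced_splits(1000, 4))  # earlier chunks get more tokens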
arXiv:2405.03892 [pdf, other] (https://arxiv.org/abs/2405.03892)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Out-of-Distribution Adaptation in Offline RL: Counterfactual Reasoning via Causal Normalizing Flows
Authors: Minjae Cho, Jonathan P. How, Chuangchuang Sun
Abstract: Despite notable successes of Reinforcement Learning (RL), the prevalent use of an online learning paradigm prevents its widespread adoption, especially in hazardous or costly scenarios. Offline RL has emerged as an alternative solution, learning from pre-collected static datasets. However, this offline learning introduces a new challenge known as distributional shift, degrading the performance when the policy is evaluated on scenarios that are Out-Of-Distribution (OOD) from the training dataset.
Most existing offline RL methods resolve this issue by regularizing policy learning within the information supported by the given dataset. However, such regularization overlooks the potential for high-reward regions that may exist beyond the dataset. This motivates exploring novel offline learning techniques that can improve beyond the data support without compromising policy performance, potentially by learning causation (cause-and-effect) instead of correlation from the dataset. In this paper, we propose the MOOD-CRL (Model-based Offline OOD-Adapting Causal RL) algorithm, which aims to address the challenge of extrapolation for offline policy training through causal inference instead of policy-regularizing methods. Specifically, a Causal Normalizing Flow (CNF) is developed to learn the transition and reward functions for data generation and augmentation in offline policy evaluation and training. Based on the data-invariant, physics-based qualitative causal graph and the observational data, we develop a novel learning scheme for CNF to learn the quantitative structural causal model. As a result, CNF gains predictive and counterfactual reasoning capabilities for sequential decision-making tasks, revealing a high potential for OOD adaptation. Our CNF-based offline RL approach is validated through empirical evaluations, outperforming model-free and model-based methods by a significant margin.
Submitted 6 May, 2024; originally announced May 2024.
Comments: Submitted for review at IEEE: Neural Networks and Learning Systems
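For readers unfamiliar with normalizing flows, the building block is an invertible transform with a tractable Jacobian. A minimal RealNVP-style affine coupling layer is sketched below; it is generic, not the causal variant the paper develops, but it is the kind of block a flow over (state, action, next-state) data could stack.

    import torch
    import torch.nn as nn

    class AffineCoupling(nn.Module):
        # One invertible block: half the input conditions an affine map of
        # the other half, so both forward and inverse are cheap to compute.
        def __init__(self, dim, hidden=64):
            super().__init__()
            self.half = dim // 2
            self.net = nn.Sequential(
                nn.Linear(self.half, hidden), nn.ReLU(),
                nn.Linear(hidden, 2 * (dim - self.half)))

        def forward(self, x):
            x1, x2 = x[:, :self.half], x[:, self.half:]
            s, t = self.net(x1).chunk(2, dim=-1)
            y2 = x2 * torch.exp(s) + t          # invertible given x1
            return torch.cat([x1, y2], dim=-1), s.sum(dim=-1)  # output, log|det J|

    y, logdet = AffineCoupling(6)(torch.randn(4, 6))
    print(y.shape, logdet.shape)   # torch.Size([4, 6]) torch.Size([4])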
arXiv:2404.17419 [pdf, other] (https://arxiv.org/abs/2404.17419)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Multi-view Image Prompted Multi-view Diffusion for Improved 3D Generation
Authors: Seungwook Kim, Yichun Shi, Kejie Li, Minsu Cho, Peng Wang
Abstract: Using images as prompts for 3D generation demonstrates particularly strong performance compared to using text prompts alone, as images provide more intuitive guidance for the 3D generation process. In this work, we delve into the potential of using multiple image prompts, instead of a single image prompt, for 3D generation. Specifically, we build on ImageDream, a novel image-prompt multi-view diffusion model, to support multi-view images as the input prompt. Our method, dubbed MultiImageDream, reveals that transitioning from a single-image prompt to multiple-image prompts enhances the performance of multi-view and 3D object generation according to various quantitative evaluation metrics and qualitative assessments. This advancement is achieved without the necessity of fine-tuning the pre-trained ImageDream multi-view diffusion model.
Submitted 26 April, 2024; originally announced April 2024.
Comments: 5 pages including references, 2 figures, 2 tables
arXiv:2404.11156 [pdf, ps, other] (https://arxiv.org/abs/2404.11156)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Learning SO(3)-Invariant Semantic Correspondence via Local Shape Transform
Authors: Chunghyun Park, Seungwook Kim, Jaesik Park, Minsu Cho
Abstract: Establishing accurate 3D correspondences between shapes stands as a pivotal challenge with profound implications for computer vision and robotics. However, existing self-supervised methods for this problem assume perfect input shape alignment, restricting their real-world applicability. In this work, we introduce a novel self-supervised Rotation-Invariant 3D correspondence learner with Local Shape Transform, dubbed RIST, that learns to establish dense correspondences between shapes even under challenging intra-class variations and arbitrary orientations. Specifically, RIST learns to dynamically formulate an SO(3)-invariant local shape transform for each point, which maps the SO(3)-equivariant global shape descriptor of the input shape to a local shape descriptor. These local shape descriptors are provided as inputs to our decoder to facilitate point cloud self- and cross-reconstruction. Our proposed self-supervised training pipeline encourages semantically corresponding points from different shapes to be mapped to similar local shape descriptors, enabling RIST to establish dense point-wise correspondences. RIST demonstrates state-of-the-art performance on 3D part label transfer and semantic keypoint transfer given arbitrarily rotated point cloud pairs, outperforming existing methods by significant margins.
Submitted 20 April, 2024; v1 submitted 17 April, 2024; originally announced April 2024.
Comments: Accepted to CVPR 2024

arXiv:2404.10603 [pdf, other] (https://arxiv.org/abs/2404.10603)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: CorrespondentDream: Enhancing 3D Fidelity of Text-to-3D using Cross-View Correspondences
Authors: Seungwook Kim, Kejie Li, Xueqing Deng, Yichun Shi, Minsu Cho, Peng Wang
Abstract: Leveraging multi-view diffusion models as priors for 3D optimization has alleviated the problem of 3D consistency, e.g., the Janus face problem or the content drift problem, in zero-shot text-to-3D models. However, the 3D geometric fidelity of the output remains an unresolved issue: although the rendered 2D views are realistic, the underlying geometry may contain errors such as unreasonable concavities. In this work, we propose CorrespondentDream, an effective method to leverage annotation-free, cross-view correspondences yielded from the diffusion U-Net to provide an additional 3D prior to the NeRF optimization process. We find that these correspondences are strongly consistent with human perception, and by adopting them in our loss design, we are able to produce NeRF models with geometries that are more coherent with common sense, e.g., smoother object surfaces, yielding higher 3D fidelity. We demonstrate the efficacy of our approach through various comparative qualitative results and a solid user study.
Submitted 16 September, 2024; v1 submitted 16 April, 2024; originally announced April 2024.
Comments: 25 pages, 22 figures, accepted to CVPR 2024
arXiv:2404.09451 [pdf, other] (https://arxiv.org/abs/2404.09451)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Contrastive Mean-Shift Learning for Generalized Category Discovery
Authors: Sua Choi, Dahyun Kang, Minsu Cho
Abstract: We address the problem of generalized category discovery (GCD), which aims to partition a partially labeled collection of images; only a small part of the collection is labeled and the total number of target classes is unknown. To address this generalized image clustering problem, we revisit the mean-shift algorithm, a classic, powerful technique for mode seeking, and incorporate it into a contrastive learning framework. The proposed method, dubbed Contrastive Mean-Shift (CMS) learning, trains an image encoder to produce representations with better clustering properties by an iterative process of mean shift and contrastive update. Experiments demonstrate that our method, in settings both with and without the total number of clusters known, achieves state-of-the-art performance on six public GCD benchmarks without bells and whistles.
Submitted 15 April, 2024; originally announced April 2024.
Comments: Accepted at CVPR 2024
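The mode-seeking half of CMS is the classic mean-shift update, which is short enough to show in full; the contrastive half (using the shifted embedding as a positive target for the encoder) is omitted here. A minimal NumPy sketch:

    import numpy as np

    def mean_shift_step(X, bandwidth=1.0):
        # Move every embedding toward the Gaussian-kernel-weighted mean of
        # all points; repeated application contracts points onto modes.
        d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
        w = np.exp(-d2 / (2 * bandwidth ** 2))
        return (w @ X) / w.sum(axis=1, keepdims=True)

    rng = np.random.default_rng(3)
    X = np.concatenate([rng.normal(0, 0.3, (20, 2)), rng.normal(4, 0.3, (20, 2))])
    for _ in range(5):
        X = mean_shift_step(X)
    print(X[0].round(2), X[-1].round(2))  # two tight clusters around the modes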
arXiv:2404.06511 [pdf, other] (https://arxiv.org/abs/2404.06511)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: MoReVQA: Exploring Modular Reasoning Models for Video Question Answering
Authors: Juhong Min, Shyamal Buch, Arsha Nagrani, Minsu Cho, Cordelia Schmid
Abstract: This paper addresses the task of video question answering (videoQA) via a decomposed multi-stage, modular reasoning framework. Previous modular methods have shown promise with a single planning stage ungrounded in visual content. However, through a simple and effective baseline, we find that such systems can lead to brittle behavior in practice for challenging videoQA settings. Thus, unlike traditional single-stage planning methods, we propose a multi-stage system consisting of an event parser, a grounding stage, and a final reasoning stage in conjunction with an external memory.
All stages are training-free and performed using few-shot prompting of large models, creating interpretable intermediate outputs at each stage. By decomposing the underlying planning and task complexity, our method, MoReVQA, improves over prior work on standard videoQA benchmarks (NExT-QA, iVQA, EgoSchema, ActivityNet-QA) with state-of-the-art results, and extends to related tasks (grounded videoQA, paragraph captioning).
Submitted 9 April, 2024; originally announced April 2024.
Comments: CVPR 2024
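Structurally, such a training-free pipeline is a chain of prompted calls sharing an external memory. The skeleton below is purely illustrative: `llm` is a hypothetical stand-in for a few-shot-prompted large model, and the prompts are invented for the example.

    def llm(prompt: str) -> str:
        # Hypothetical few-shot-prompted model call, stubbed for illustration.
        return f"<output for: {prompt[:40]}...>"

    def modular_videoqa(question: str, video_id: str) -> str:
        memory = {"video": video_id}                                   # external memory
        memory["events"] = llm(f"Parse the events in: {question}")     # event parser
        memory["grounded"] = llm(                                      # grounding stage
            f"Ground events {memory['events']} in video {memory['video']}")
        return llm(f"Answer '{question}' given {memory['grounded']}")  # reasoning stage

    print(modular_videoqa("What does the chef do after chopping?", "vid_001"))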
arXiv:2404.03924 [pdf, other] (https://arxiv.org/abs/2404.03924)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Learning Correlation Structures for Vision Transformers
Authors: Manjin Kim, Paul Hongsuck Seo, Cordelia Schmid, Minsu Cho
Abstract: We introduce a new attention mechanism, dubbed structural self-attention (StructSA), that leverages rich correlation patterns naturally emerging in key-query interactions of attention. StructSA generates attention maps by recognizing space-time structures of key-query correlations via convolution and uses them to dynamically aggregate local contexts of value features. This effectively leverages rich structural patterns in images and videos such as scene layouts, object motion, and inter-object relations. Using StructSA as a main building block, we develop the structural vision transformer (StructViT) and evaluate its effectiveness on both image and video classification tasks, achieving state-of-the-art results on ImageNet-1K, Kinetics-400, Something-Something V1 & V2, Diving-48, and FineGym.
Submitted 5 April, 2024; originally announced April 2024.
Comments: Accepted to CVPR 2024
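The core idea, convolving the query-key correlation map before the softmax so that attention can respond to correlation patterns rather than isolated scores, can be sketched in a toy 1-D form; the real StructSA operates on space-time structures and is considerably richer.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TinyStructSA(nn.Module):
        # Toy 1-D variant: a small convolution over the key axis of the
        # query-key correlation map detects local correlation patterns
        # before they are softmaxed into attention weights.
        def __init__(self, dim, kernel=3):
            super().__init__()
            self.qkv = nn.Linear(dim, 3 * dim)
            self.struct = nn.Conv1d(1, 1, kernel, padding=kernel // 2)
            self.scale = dim ** -0.5

        def forward(self, x):                        # x: (B, N, D)
            q, k, v = self.qkv(x).chunk(3, dim=-1)
            corr = q @ k.transpose(1, 2) * self.scale          # (B, N, N)
            B, N, _ = corr.shape
            corr = self.struct(corr.reshape(B * N, 1, N)).reshape(B, N, N)
            return F.softmax(corr, dim=-1) @ v

    print(TinyStructSA(16)(torch.randn(2, 10, 16)).shape)   # torch.Size([2, 10, 16])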
arXiv:2404.01842 [pdf, other] (https://arxiv.org/abs/2404.01842)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Semi-Supervised Domain Adaptation for Wildfire Detection
Authors: JooYoung Jang, Youngseo Cha, Jisu Kim, SooHyung Lee, Geonu Lee, Minkook Cho, Young Hwang, Nojun Kwak
Abstract: Recently, both the frequency and intensity of wildfires have increased worldwide, primarily due to climate change. In this paper, we propose a novel protocol for wildfire detection, leveraging semi-supervised Domain Adaptation for object detection, accompanied by a corresponding dataset designed for use by both academics and industries. Our dataset encompasses 30 times more diverse labeled scenes than the current largest benchmark wildfire dataset, HPWREN, and introduces a new labeling policy for wildfire detection. Inspired by CoordConv, we propose a robust baseline, Location-Aware Object Detection for Semi-Supervised Domain Adaptation (LADA), utilizing a teacher-student based framework capable of extracting translational variance features characteristic of wildfires. Using only 1% of the target-domain labeled data, our framework significantly outperforms our source-only baseline by a notable margin of 3.8% in mean Average Precision on the HPWREN wildfire dataset. Our dataset is available at https://github.com/BloomBerry/LADA.
Submitted 2 April, 2024; originally announced April 2024.
Comments: 16 pages, 5 figures, 22 tables
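CoordConv, which this abstract cites as its inspiration, is simple to reproduce: append normalized coordinate channels to the input so convolutions can learn position-dependent (translation-variant) features. A minimal PyTorch layer, independent of LADA's full detector:

    import torch
    import torch.nn as nn

    class CoordConv2d(nn.Module):
        # Standard convolution over the input plus two extra channels that
        # encode normalized (y, x) position, breaking translation invariance.
        def __init__(self, in_ch, out_ch, **kw):
            super().__init__()
            self.conv = nn.Conv2d(in_ch + 2, out_ch, **kw)

        def forward(self, x):
            b, _, h, w = x.shape
            ys = torch.linspace(-1, 1, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
            xs = torch.linspace(-1, 1, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
            return self.conv(torch.cat([x, ys, xs], dim=1))

    out = CoordConv2d(3, 8, kernel_size=3, padding=1)(torch.randn(1, 3, 16, 16))
    print(out.shape)   # torch.Size([1, 8, 16, 16])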
arXiv:2404.00060 [pdf, other] (https://arxiv.org/abs/2404.00060)
Subjects: q-fin.ST (Statistical Finance); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Temporal Graph Networks for Graph Anomaly Detection in Financial Networks
Authors: Yejin Kim, Youngbin Lee, Minyoung Choe, Sungju Oh, Yongjae Lee
Abstract: This paper explores the utilization of Temporal Graph Networks (TGN) for financial anomaly detection, a pressing need in the era of fintech and digitized financial transactions. We present a comprehensive framework that leverages TGN, capable of capturing dynamic changes in edges within financial networks, for fraud detection. Our study compares TGN's performance against static Graph Neural Network (GNN) baselines, as well as cutting-edge hypergraph neural network baselines, using the DGraph dataset for a realistic financial context. Our results demonstrate that TGN significantly outperforms other models in terms of AUC metrics. This superior performance underlines TGN's potential as an effective tool for detecting financial fraud, showcasing its ability to adapt to the dynamic and complex nature of modern financial systems. We also experimented with various graph embedding modules within the TGN framework and compared the effectiveness of each module. In conclusion, we demonstrate that, even with variations within TGN, it is possible to achieve good performance on the anomaly detection task.
Submitted 27 March, 2024; originally announced April 2024.
Comments: Presented at the AAAI 2024 Workshop on AI in Finance for Social Impact (https://sites.google.com/view/aifin-aaai2024)
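As a sketch of how edge-level anomaly scores would feed the AUC evaluation described above (a generic scoring step, with random vectors standing in for the temporal node embeddings a TGN would produce): an edge the embedding model deems unlikely receives a high anomaly score.

    import numpy as np

    def edge_anomaly_scores(src_emb, dst_emb):
        # Anomaly score = 1 - sigmoid(<src, dst>): low predicted link
        # probability means the edge looks unusual to the model.
        logits = np.sum(src_emb * dst_emb, axis=1)
        return 1.0 - 1.0 / (1.0 + np.exp(-logits))

    rng = np.random.default_rng(4)
    src, dst = rng.normal(size=(6, 8)), rng.normal(size=(6, 8))
    print(edge_anomaly_scores(src, dst).round(3))  # one score per edge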
arXiv:2402.11477 (https://arxiv.org/abs/2402.11477) [pdf, other]
Subjects: cs.CY (Computers and Society)
Title: Cross-Cultural Differences in Mental Health Expressions on Social Media
Authors: Sunny Rai, Khushi Shelat, Devansh R Jain, Kishen Sivabalan, Young Min Cho, Maitreyi Redkar, Samindara Sawant, Lyle H. Ungar, Sharath Chandra Guntuku
Abstract: Culture moderates the way individuals perceive and express mental distress. Current understandings of mental health expressions on social media, however, are predominantly derived from WEIRD (Western, Educated, Industrialized, Rich, and Democratic) contexts. To address this gap, we examine mental health posts on Reddit made by individuals geolocated in India, to identify variations in social media language specific to the Indian context compared to users from Western nations. Our experiments reveal significant psychosocial variations in emotions and temporal orientation. This study demonstrates the potential of social media platforms for identifying cross-cultural differences in mental health expressions (e.g., seeking advice among Indian users vs. seeking support among Western users). Significant linguistic variations in online mental health-related language emphasize the importance of developing precision-targeted interventions that are culturally appropriate.
Submitted 8 February, 2025; v1 submitted 18 February, 2024; originally announced February 2024.
arXiv:2401.05659 (https://arxiv.org/abs/2401.05659) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction); cs.SE (Software Engineering)
Title: Engineering Adaptive Information Graphics for Disabled Communities: A Case Study with Public Space Indoor Maps
Authors: Anuradha Madugalla, Yutan Huang, John Grundy, Min Hee Cho, Lasith Koswatta Gamage, Tristan Leao, Sam Thiele
Abstract: Most software applications contain graphics such as charts, diagrams and maps. Currently, these graphics are designed with a "one size fits all" approach and do not cater to the needs of people with disabilities. Therefore, when using software with graphics, a colour-impaired user may struggle to interpret graphics with certain colours, and a person with dyslexia may struggle to read the text labels in the graphic. Our research addresses this issue by developing a framework that generates adaptive and accessible information graphics for multiple disabilities. Uniquely, the approach also serves people with multiple simultaneous disabilities. To achieve this, we used a case study of public space floorplans presented via a web tool and worked with four disability groups: people with low vision, colour blindness, dyslexia and mobility impairment. Our research involved gathering requirements from 3 accessibility experts and 80 participants with disabilities, developing a system to generate adaptive graphics that address the identified requirements, and conducting an evaluation with 7 participants with disabilities. The evaluation showed that users found our solution easy to use and suitable for most of their requirements. The study also provides recommendations for front-end developers on engineering accessible graphics for their software and discusses the implications of our work on society from the perspective of public space owners and end users.
Submitted 10 January, 2024; originally announced January 2024.
arXiv:2312.11514 (https://arxiv.org/abs/2312.11514) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: LLM in a flash: Efficient Large Language Model Inference with Limited Memory
Authors: Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, Mehrdad Farajtabar
Abstract: Large language models (LLMs) are central to modern natural language processing, delivering exceptional performance in various tasks. However, their substantial computational and memory requirements present challenges, especially for devices with limited DRAM capacity.
This paper tackles the challenge of efficiently running LLMs that exceed the available DRAM capacity by storing the model parameters in flash memory and bringing them into DRAM on demand. Our method involves constructing an inference cost model that takes into account the characteristics of flash memory, guiding us to optimize in two critical areas: reducing the volume of data transferred from flash and reading data in larger, more contiguous chunks. Within this hardware-informed framework, we introduce two principal techniques. First, "windowing" strategically reduces data transfer by reusing previously activated neurons; second, "row-column bundling", tailored to the sequential data access strengths of flash memory, increases the size of the data chunks read from flash memory. These methods collectively enable running models up to twice the size of the available DRAM, with a 4-5x and 20-25x increase in inference speed compared to naive loading approaches on CPU and GPU, respectively. Our integration of sparsity awareness, context-adaptive loading, and a hardware-oriented design paves the way for effective inference of LLMs on devices with limited memory.
Submitted 30 July, 2024; v1 submitted 12 December, 2023; originally announced December 2023.
Comments: ACL 2024
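The "windowing" technique can be pictured with a toy cache; this is our sketch under made-up sizes, with a NumPy memmap merely standing in for flash:

    import numpy as np

    # Toy stand-in for FFN weight rows resident on "flash" (sizes are made up).
    flash_rows = np.memmap("ffn_up.bin", dtype=np.float16, mode="w+",
                           shape=(2048, 512))
    ram_cache = {}   # neuron id -> row currently resident in DRAM
    window = []      # active-neuron sets for the last k tokens

    def load_active_rows(active_ids, k=8):
        """Windowing: keep rows of recently active neurons in DRAM and
        transfer from flash only the rows that just became active."""
        window.append(set(active_ids))
        if len(window) > k:
            stale = window.pop(0) - set().union(*window)
            for i in stale:
                ram_cache.pop(i, None)      # evict rows that left the window
        for i in active_ids:
            if i not in ram_cache:          # flash -> DRAM only for new neurons
                ram_cache[i] = np.asarray(flash_rows[i])
        return ram_cache

    for step in range(20):                  # simulated decoding steps
        load_active_rows(np.random.choice(2048, size=64, replace=False))
    print(len(ram_cache), "rows resident in DRAM")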
arXiv:2312.10230 (https://arxiv.org/abs/2312.10230) [pdf, other]
Subjects: cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Constrained Meta-Reinforcement Learning for Adaptable Safety Guarantee with Differentiable Convex Programming
Authors: Minjae Cho, Chuangchuang Sun
Abstract: Despite remarkable achievements in artificial intelligence, the deployability of learning-enabled systems in high-stakes real-world environments still faces persistent challenges. For example, in safety-critical domains like autonomous driving, robotic manipulation, and healthcare, it is crucial not only to achieve high performance but also to comply with given constraints. Furthermore, adaptability becomes paramount in non-stationary domains, where environmental parameters are subject to change. While safety and adaptability are recognized as key qualities for the new generation of AI, current approaches have not demonstrated effective adaptable performance in constrained settings. Hence, this paper breaks new ground by studying the unique challenges of ensuring safety in non-stationary environments, solving constrained problems through the lens of meta-learning (learning-to-learn). While unconstrained meta-learning already encounters complexities in end-to-end differentiation of the loss due to its bi-level nature, the constrained counterpart introduces an additional layer of difficulty, since the constraints imposed on task-level updates complicate the differentiation process. To address the issue, we first employ successive convex-constrained policy updates across multiple tasks with differentiable convex programming, which allows meta-learning in constrained scenarios by enabling end-to-end differentiation. This approach empowers the agent to rapidly adapt to new tasks under non-stationarity while ensuring compliance with safety constraints.
Submitted 15 December, 2023; originally announced December 2023.
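Differentiating end-to-end through a convex-constrained update can be sketched with the cvxpylayers library; the projection problem below is our own toy example, not the paper's formulation:

    import cvxpy as cp
    import torch
    from cvxpylayers.torch import CvxpyLayer

    n = 4
    # Toy projected step: stay close to an unconstrained update x_target
    # while satisfying a linear safety constraint a^T x <= b.
    x = cp.Variable(n)
    x_target = cp.Parameter(n)   # depends on meta-parameters upstream
    a = cp.Parameter(n)
    b = cp.Parameter()
    prob = cp.Problem(cp.Minimize(cp.sum_squares(x - x_target)), [a @ x <= b])
    layer = CvxpyLayer(prob, parameters=[x_target, a, b], variables=[x])

    theta = torch.randn(n, requires_grad=True)        # stand-in meta-parameters
    x_star, = layer(theta, torch.ones(n), torch.tensor(1.0))
    x_star.sum().backward()                           # gradients flow through the argmin
    print(theta.grad)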
arXiv:2312.07315 (https://arxiv.org/abs/2312.07315) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: NVS-Adapter: Plug-and-Play Novel View Synthesis from a Single Image
Authors: Yoonwoo Jeong, Jinwoo Lee, Chiheon Kim, Minsu Cho, Doyup Lee
Abstract: Transfer learning of large-scale Text-to-Image (T2I) models has recently shown impressive potential for Novel View Synthesis (NVS) of diverse objects from a single image. While previous methods typically train large models on multi-view datasets for NVS, fine-tuning all the parameters of T2I models not only demands a high cost but also reduces the generalization capacity of T2I models in generating diverse images in a new domain. In this study, we propose an effective method, dubbed NVS-Adapter, which is a plug-and-play module for a T2I model, to synthesize novel multi-views of visual objects while fully exploiting the generalization capacity of T2I models. NVS-Adapter consists of two main components: view-consistency cross-attention, which learns the visual correspondences to align the local details of view features, and global semantic conditioning, which aligns the semantic structure of generated views with the reference view. Experimental results demonstrate that the NVS-Adapter can effectively synthesize geometrically consistent multi-views and also achieve high performance on benchmarks without full fine-tuning of T2I models. The code and data are publicly available at https://postech-cvlab.github.io/nvsadapter/.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.07315v2-abstract-full').style.display = 'none'; document.getElementById('2312.07315v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">[ECCV2024] Project Page: https://postech-cvlab.github.io/nvsadapter/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04594">arXiv:2312.04594</a> <span> [<a href="https://arxiv.org/pdf/2312.04594">pdf</a>, <a href="https://arxiv.org/format/2312.04594">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedGeo: Privacy-Preserving User Next Location Prediction with Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+C">Chung Park</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+T">Taekyoon Choi</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+T">Taesan Kim</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Mincheol Cho</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+J">Junui Hong</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+M">Minsung Choi</a>, <a href="/search/cs?searchtype=author&query=Choo%2C+J">Jaegul Choo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04594v1-abstract-short" style="display: inline;"> A User Next Location Prediction (UNLP) task, which predicts the next location that a user will move to given his/her trajectory, is an indispensable task for a wide range of applications. Previous studies using large-scale trajectory datasets in a single server have achieved remarkable performance in UNLP task. However, in real-world applications, legal and ethical issues have been raised regardin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04594v1-abstract-full').style.display = 'inline'; document.getElementById('2312.04594v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04594v1-abstract-full" style="display: none;"> A User Next Location Prediction (UNLP) task, which predicts the next location that a user will move to given his/her trajectory, is an indispensable task for a wide range of applications. Previous studies using large-scale trajectory datasets in a single server have achieved remarkable performance in UNLP task. 
arXiv:2312.04594 (https://arxiv.org/abs/2312.04594) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: FedGeo: Privacy-Preserving User Next Location Prediction with Federated Learning
Authors: Chung Park, Taekyoon Choi, Taesan Kim, Mincheol Cho, Junui Hong, Minsung Choi, Jaegul Choo
Abstract: A User Next Location Prediction (UNLP) task, which predicts the next location that a user will move to given his/her trajectory, is an indispensable task for a wide range of applications. Previous studies using large-scale trajectory datasets in a single server have achieved remarkable performance in the UNLP task. However, in real-world applications, legal and ethical issues have been raised regarding privacy, leading to restrictions against sharing human trajectory datasets with any other server. In response, Federated Learning (FL) has emerged to address the personal privacy issue by collaboratively training models across multiple clients (i.e., users) and then aggregating them. While previous studies have employed FL for UNLP, they are still unable to achieve reliable performance because of the heterogeneity of clients' mobility. To tackle this problem, we propose Federated Learning for Geographic Information (FedGeo), an FL framework specialized for UNLP, which alleviates the heterogeneity of clients' mobility and guarantees personal privacy protection. Firstly, we incorporate prior global geographic adjacency information into the local client model, since each client holds only a heterogeneous subset of the overall trajectories in FL and thus learns the spatial correlation between locations only partially. We also introduce a novel aggregation method that minimizes the gap between client models, to solve the problem of client drift caused by differences between client models when learning with their heterogeneous data. Lastly, we probabilistically exclude clients with extremely heterogeneous data from the FL process by focusing on clients who visit relatively diverse locations. We show that FedGeo is superior to other FL methods for model performance in the UNLP task. We also validated our model in a real-world application using our own customers' mobile phones and the FL agent system.
Submitted 5 December, 2023; originally announced December 2023.
Comments: Accepted at the 31st ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems (ACM SIGSPATIAL 2023)
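The abstract does not specify its aggregation beyond minimizing gaps between client models; as a loose, hypothetical stand-in (our illustration, not FedGeo's actual method), a FedAvg-style step can down-weight clients whose models drift far from the mean:

    import torch

    def aggregate(client_states, temperature=1.0):
        """FedAvg-style aggregation that down-weights outlier clients;
        assumes float parameter tensors only."""
        keys = client_states[0].keys()
        mean = {k: torch.stack([s[k] for s in client_states]).mean(0) for k in keys}
        # squared distance of each client model from the mean model
        dists = torch.stack([
            sum(((s[k] - mean[k]) ** 2).sum() for k in keys) for s in client_states
        ])
        w = torch.softmax(-dists / temperature, dim=0)   # closer => larger weight
        return {k: sum(w[i] * client_states[i][k] for i in range(len(client_states)))
                for k in keys}

    # usage: server_state = aggregate([m.state_dict() for m in client_models])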
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at 31st ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems (ACM SIGSPATIAL 2023)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.04266">arXiv:2312.04266</a> <span> [<a href="https://arxiv.org/pdf/2312.04266">pdf</a>, <a href="https://arxiv.org/format/2312.04266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Activity Grammars for Temporal Action Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+D">Dayoung Gong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joonseok Lee</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+D">Deunsol Jung</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Suha Kwak</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsu Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.04266v1-abstract-short" style="display: inline;"> Sequence prediction on temporal data requires the ability to understand compositional structures of multi-level semantics beyond individual and contextual properties. The task of temporal action segmentation, which aims at translating an untrimmed activity video into a sequence of action segments, remains challenging for this reason. This paper addresses the problem by introducing an effective act… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04266v1-abstract-full').style.display = 'inline'; document.getElementById('2312.04266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.04266v1-abstract-full" style="display: none;"> Sequence prediction on temporal data requires the ability to understand compositional structures of multi-level semantics beyond individual and contextual properties. The task of temporal action segmentation, which aims at translating an untrimmed activity video into a sequence of action segments, remains challenging for this reason. This paper addresses the problem by introducing an effective activity grammar to guide neural predictions for temporal action segmentation. We propose a novel grammar induction algorithm that extracts a powerful context-free grammar from action sequence data. We also develop an efficient generalized parser that transforms frame-level probability distributions into a reliable sequence of actions according to the induced grammar with recursive rules. Our approach can be combined with any neural network for temporal action segmentation to enhance the sequence prediction and discover its compositional structure. Experimental results demonstrate that our method significantly improves temporal action segmentation in terms of both performance and interpretability on two standard benchmarks, Breakfast and 50 Salads. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.04266v1-abstract-full').style.display = 'none'; document.getElementById('2312.04266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.02878">arXiv:2312.02878</a> <span> [<a href="https://arxiv.org/pdf/2312.02878">pdf</a>, <a href="https://arxiv.org/format/2312.02878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards More Practical Group Activity Detection: A New Benchmark and Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dongkeun Kim</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Youngkil Song</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Minsu Cho</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Suha Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.02878v2-abstract-short" style="display: inline;"> Group activity detection (GAD) is the task of identifying members of each group and classifying the activity of the group at the same time in a video. While GAD has been studied recently, there is still much room for improvement in both dataset and methodology due to their limited capability to address practical GAD scenarios. To resolve these issues, we first present a new dataset, dubbed Caf茅. U… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02878v2-abstract-full').style.display = 'inline'; document.getElementById('2312.02878v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.02878v2-abstract-full" style="display: none;"> Group activity detection (GAD) is the task of identifying members of each group and classifying the activity of the group at the same time in a video. While GAD has been studied recently, there is still much room for improvement in both dataset and methodology due to their limited capability to address practical GAD scenarios. To resolve these issues, we first present a new dataset, dubbed Caf茅. Unlike existing datasets, Caf茅 is constructed primarily for GAD and presents more practical scenarios and metrics, as well as being large-scale and providing rich annotations. Along with the dataset, we propose a new GAD model that deals with an unknown number of groups and latent group members efficiently and effectively. We evaluated our model on three datasets including Caf茅, where it outperformed previous work in terms of both accuracy and inference speed. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.02878v2-abstract-full').style.display = 'none'; document.getElementById('2312.02878v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024, Project page: https://cvlab.postech.ac.kr/research/CAFE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.01133">arXiv:2312.01133</a> <span> [<a href="https://arxiv.org/pdf/2312.01133">pdf</a>, <a href="https://arxiv.org/format/2312.01133">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> $t^3$-Variational Autoencoder: Learning Heavy-tailed Data with Student's t and Power Divergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+J">Juno Kim</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+J">Jaehyuk Kwon</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+M">Mincheol Cho</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hyunjong Lee</a>, <a href="/search/cs?searchtype=author&query=Won%2C+J">Joong-Ho Won</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.01133v2-abstract-short" style="display: inline;"> The variational autoencoder (VAE) typically employs a standard normal prior as a regularizer for the probabilistic latent encoder. However, the Gaussian tail often decays too quickly to effectively accommodate the encoded points, failing to preserve crucial structures hidden in the data. In this paper, we explore the use of heavy-tailed models to combat over-regularization. Drawing upon insights f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.01133v2-abstract-full').style.display = 'inline'; document.getElementById('2312.01133v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.01133v2-abstract-full" style="display: none;"> The variational autoencoder (VAE) typically employs a standard normal prior as a regularizer for the probabilistic latent encoder. However, the Gaussian tail often decays too quickly to effectively accommodate the encoded points, failing to preserve crucial structures hidden in the data. In this paper, we explore the use of heavy-tailed models to combat over-regularization. Drawing upon insights from information geometry, we propose $t^3$VAE, a modified VAE framework that incorporates Student's t-distributions for the prior, encoder, and decoder. This results in a joint model distribution of a power form which we argue can better fit real-world datasets. 
We derive a new objective by reformulating the evidence lower bound as joint optimization of the KL divergence between two statistical manifolds and replacing it with the $\gamma$-power divergence, a natural alternative for power families. $t^3$VAE demonstrates superior generation of low-density regions when trained on heavy-tailed synthetic data. Furthermore, we show that $t^3$VAE significantly outperforms other models on the CelebA and imbalanced CIFAR-100 datasets.
Submitted 3 March, 2024; v1 submitted 2 December, 2023; originally announced December 2023.
Comments: ICLR 2024; 27 pages, 7 figures, 8 tables
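The general flavor of swapping the Gaussian prior for a heavy-tailed one can be sketched with a Monte Carlo regularizer; this toy uses a Student's t prior with a plain KL estimate, not the paper's $\gamma$-power divergence objective:

    import torch
    from torch.distributions import Normal, StudentT

    def heavy_tail_reg(mu, log_sigma, df=5.0, n_samples=8):
        """Monte Carlo estimate of KL(q(z|x) || p(z)) with a Student-t prior;
        replaces the closed-form Gaussian KL term of a standard VAE."""
        q = Normal(mu, log_sigma.exp())
        p = StudentT(df, torch.zeros_like(mu), torch.ones_like(mu))
        z = q.rsample((n_samples,))             # reparameterized samples
        return (q.log_prob(z) - p.log_prob(z)).mean()

    mu, log_sigma = torch.randn(16, 8), torch.randn(16, 8)
    print(heavy_tail_reg(mu, log_sigma))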
href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>