Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,188 results for author: <span class="mathjax">Kim, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Kim%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Kim, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Kim%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Kim, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Kim%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Kim%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Kim%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Kim%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Kim%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Kim%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14384">arXiv:2411.14384</a> <span> [<a href="https://arxiv.org/pdf/2411.14384">pdf</a>, <a href="https://arxiv.org/format/2411.14384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2411.14384v1-abstract-short" style="display: inline;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly out… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14384v1-abstract-full" style="display: none;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveals the practical values of our method. Our Project page at https://caiyuanhao1998.github.io/project/DiffusionGS/ shows the video and interactive generation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v1-abstract-full').style.display = 'none'; document.getElementById('2411.14384v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13239">arXiv:2411.13239</a> <span> [<a href="https://arxiv.org/pdf/2411.13239">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Transforming the Hybrid Cloud for Emerging AI Workloads </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deming Chen</a>, <a href="/search/cs?searchtype=author&query=Youssef%2C+A">Alaa Youssef</a>, <a href="/search/cs?searchtype=author&query=Pendse%2C+R">Ruchi Pendse</a>, <a href="/search/cs?searchtype=author&query=Schleife%2C+A">Andr茅 Schleife</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+B+K">Bryan K. Clark</a>, <a href="/search/cs?searchtype=author&query=Hamann%2C+H">Hendrik Hamann</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jingrui He</a>, <a href="/search/cs?searchtype=author&query=Laino%2C+T">Teodoro Laino</a>, <a href="/search/cs?searchtype=author&query=Varshney%2C+L">Lav Varshney</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxiong Wang</a>, <a href="/search/cs?searchtype=author&query=Sil%2C+A">Avirup Sil</a>, <a href="/search/cs?searchtype=author&query=Jabbarvand%2C+R">Reyhaneh Jabbarvand</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tianyin Xu</a>, <a href="/search/cs?searchtype=author&query=Kindratenko%2C+V">Volodymyr Kindratenko</a>, <a href="/search/cs?searchtype=author&query=Costa%2C+C">Carlos Costa</a>, <a href="/search/cs?searchtype=author&query=Adve%2C+S">Sarita Adve</a>, <a href="/search/cs?searchtype=author&query=Mendis%2C+C">Charith Mendis</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minjia Zhang</a>, <a href="/search/cs?searchtype=author&query=N%C3%BA%C3%B1ez-Corrales%2C+S">Santiago N煤帽ez-Corrales</a>, <a href="/search/cs?searchtype=author&query=Ganti%2C+R">Raghu Ganti</a>, <a href="/search/cs?searchtype=author&query=Srivatsa%2C+M">Mudhakar Srivatsa</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+N+S">Nam Sung Kim</a>, <a href="/search/cs?searchtype=author&query=Torrellas%2C+J">Josep Torrellas</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jian Huang</a>, <a href="/search/cs?searchtype=author&query=Seelam%2C+S">Seetharami Seelam</a> , et al. 
(19 additional authors not shown)
Abstract: This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches that emphasize usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge technologies such as generative and agentic AI, cross-layer automation and optimization, a unified control plane, and composable and adaptive system architecture, the proposed framework addresses critical challenges in energy efficiency, performance, and cost-effectiveness. Incorporating quantum computing as it matures will enable quantum-accelerated simulations for materials science, climate modeling, and other high-impact domains. Collaborative efforts between academia and industry are central to this vision, driving advancements in foundation models for material design and climate solutions, scalable multimodal data processing, and enhanced physics-based AI emulators for applications such as weather forecasting and carbon sequestration. Research priorities include advancing AI agentic systems, LLM as an Abstraction (LLMaaA), AI model optimization and unified abstractions across heterogeneous infrastructure, end-to-end edge-cloud transformation, efficient programming models, middleware and platforms, secure infrastructure, application-adaptive cloud systems, and new quantum-classical collaborative workflows. These ideas and solutions encompass both theoretical and practical research questions and require coordinated input and support from the research community. This joint initiative aims to establish hybrid clouds as secure, efficient, and sustainable platforms, fostering breakthroughs in AI-driven applications and scientific discovery across academia, industry, and society.
Submitted 20 November, 2024; originally announced November 2024.
Comments: 70 pages, 27 figures.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">70 pages, 27 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12878">arXiv:2411.12878</a> <span> [<a href="https://arxiv.org/pdf/2411.12878">pdf</a>, <a href="https://arxiv.org/format/2411.12878">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Local Anti-Concentration Class: Logarithmic Regret for Greedy Linear Contextual Bandit </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seok-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+M">Min-hwan Oh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12878v1-abstract-short" style="display: inline;"> We study the performance guarantees of exploration-free greedy algorithms for the linear contextual bandit problem. We introduce a novel condition, named the \textit{Local Anti-Concentration} (LAC) condition, which enables a greedy bandit algorithm to achieve provable efficiency. We show that the LAC condition is satisfied by a broad class of distributions, including Gaussian, exponential, uniform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12878v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12878v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12878v1-abstract-full" style="display: none;"> We study the performance guarantees of exploration-free greedy algorithms for the linear contextual bandit problem. We introduce a novel condition, named the \textit{Local Anti-Concentration} (LAC) condition, which enables a greedy bandit algorithm to achieve provable efficiency. We show that the LAC condition is satisfied by a broad class of distributions, including Gaussian, exponential, uniform, Cauchy, and Student's~$t$ distributions, along with other exponential family distributions and their truncated variants. This significantly expands the class of distributions under which greedy algorithms can perform efficiently. Under our proposed LAC condition, we prove that the cumulative expected regret of the greedy algorithm for the linear contextual bandit is bounded by $O(\operatorname{poly} \log T)$. Our results establish the widest range of distributions known to date that allow a sublinear regret bound for greedy algorithms, further achieving a sharp poly-logarithmic regret. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12878v1-abstract-full').style.display = 'none'; document.getElementById('2411.12878v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12525">arXiv:2411.12525</a> <span> [<a href="https://arxiv.org/pdf/2411.12525">pdf</a>, <a href="https://arxiv.org/format/2411.12525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Top Probability from Multi-view for Distracted Driver Behaviour Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+Q+V">Quang Vinh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Son%2C+V+H+T">Vo Hoang Thanh Son</a>, <a href="/search/cs?searchtype=author&query=Hoang%2C+C+T+V">Chau Truong Vinh Hoang</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+D+D">Duc Duy Nguyen</a>, <a href="/search/cs?searchtype=author&query=Minh%2C+N+H+N">Nhat Huy Nguyen Minh</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Soo-Hyung Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12525v1-abstract-short" style="display: inline;"> Naturalistic driving action localization task aims to recognize and comprehend human behaviors and actions from video data captured during real-world driving scenarios. Previous studies have shown great action localization performance by applying a recognition model followed by probability-based post-processing. Nevertheless, the probabilities provided by the recognition model frequently contain c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12525v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12525v1-abstract-full" style="display: none;"> Naturalistic driving action localization task aims to recognize and comprehend human behaviors and actions from video data captured during real-world driving scenarios. Previous studies have shown great action localization performance by applying a recognition model followed by probability-based post-processing. Nevertheless, the probabilities provided by the recognition model frequently contain confused information causing challenge for post-processing. In this work, we adopt an action recognition model based on self-supervise learning to detect distracted activities and give potential action probabilities. Subsequently, a constraint ensemble strategy takes advantages of multi-camera views to provide robust predictions. Finally, we introduce a conditional post-processing operation to locate distracted behaviours and action temporal boundaries precisely. Experimenting on test set A2, our method obtains the sixth position on the public leaderboard of track 3 of the 2024 AI City Challenge. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12525v1-abstract-full').style.display = 'none'; document.getElementById('2411.12525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Computer Vision and Pattern Recognition Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11323">arXiv:2411.11323</a> <span> [<a href="https://arxiv.org/pdf/2411.11323">pdf</a>, <a href="https://arxiv.org/format/2411.11323">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SayComply: Grounding Field Robotic Tasks in Operational Compliance through Retrieval-Based Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ginting%2C+M+F">Muhammad Fadhil Ginting</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dong-Ki Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sung-Kyun Kim</a>, <a href="/search/cs?searchtype=author&query=Krishna%2C+B+J">Bandi Jai Krishna</a>, <a href="/search/cs?searchtype=author&query=Kochenderfer%2C+M+J">Mykel J. Kochenderfer</a>, <a href="/search/cs?searchtype=author&query=Omidshafiei%2C+S">Shayegan Omidshafiei</a>, <a href="/search/cs?searchtype=author&query=Agha-mohammadi%2C+A">Ali-akbar Agha-mohammadi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11323v1-abstract-short" style="display: inline;"> This paper addresses the problem of task planning for robots that must comply with operational manuals in real-world settings. Task planning under these constraints is essential for enabling autonomous robot operation in domains that require adherence to domain-specific knowledge. Current methods for generating robot goals and plans rely on common sense knowledge encoded in large language models.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11323v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11323v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11323v1-abstract-full" style="display: none;"> This paper addresses the problem of task planning for robots that must comply with operational manuals in real-world settings. Task planning under these constraints is essential for enabling autonomous robot operation in domains that require adherence to domain-specific knowledge. Current methods for generating robot goals and plans rely on common sense knowledge encoded in large language models. However, these models lack grounding of robot plans to domain-specific knowledge and are not easily transferable between multiple sites or customers with different compliance needs. 
In this work, we present SayComply, which grounds robotic task planning in operational compliance using retrieval-based language models. We design a hierarchical database of operational, environment, and robot embodiment manuals and procedures to enable efficient retrieval of the relevant context within the limited context length of LLMs. We then design a task planner using a tree-based retrieval-augmented generation (RAG) technique to generate robot tasks that follow user instructions while complying with the domain knowledge in the database. We demonstrate the benefits of our approach through simulations and hardware experiments in real-world scenarios that require precise context retrieval across various types of context, outperforming the standard RAG method. Our approach bridges the gap in deploying robots that consistently adhere to operational protocols, offering a scalable and edge-deployable solution for ensuring compliance across varied and complex real-world environments. Project website: saycomply.github.io.
Submitted 18 November, 2024; originally announced November 2024.
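A toy sketch of the tree-based retrieval idea as this abstract describes it: descend one branch of the manual hierarchy per level so the retrieved context stays within the LLM's context budget. The word-overlap `score` stands in for a real embedding model, and the tree layout and field names are assumptions, not the SayComply database schema:

```python
# Illustrative tree-descent retrieval over a hierarchical manual database.
def score(query: str, summary: str) -> float:
    q, s = set(query.lower().split()), set(summary.lower().split())
    return len(q & s)                 # stand-in for embedding similarity

def tree_retrieve(node: dict, query: str) -> str:
    # Keep only the most relevant branch at each level
    # (e.g., operational -> environment -> embodiment).
    while node.get("children"):
        node = max(node["children"], key=lambda c: score(query, c["summary"]))
    return node["text"]

manuals = {"children": [
    {"summary": "underground mine operations manual",
     "text": "Ventilation check required before entry; radio check every 30 min.",
     "children": []},
    {"summary": "indoor warehouse operations manual",
     "text": "Keep 2 m from racking; announce presence at intersections.",
     "children": []},
]}

context = tree_retrieve(manuals, "inspect racking in the warehouse")
# `context` would be prepended to the LLM prompt that generates the task plan.
```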
6. arXiv:2411.10761 [pdf, other] cs.CL
Can Generic LLMs Help Analyze Child-adult Interactions Involving Children with Autism in Clinical Observation?
Authors: Tiantian Feng, Anfeng Xu, Rimita Lahiri, Helen Tager-Flusberg, So Hyun Kim, Somer Bishop, Catherine Lord, Shrikanth Narayanan
Abstract: Large Language Models (LLMs) have shown significant potential in understanding human communication and interaction. However, their performance in the domain of child-inclusive interactions, including in clinical settings, remains less explored. In this work, we evaluate generic LLMs' ability to analyze child-adult dyadic interactions in a clinically relevant context involving children with ASD. Specifically, we explore LLMs on four tasks: classifying child-adult utterances, predicting engaged activities, recognizing language skills, and identifying clinically relevant traits. Our evaluation shows that generic LLMs are highly capable of analyzing long and complex conversations from clinical observation sessions, often surpassing the performance of non-expert human evaluators. The results show their potential to segment interactions of interest, assist in language-skill evaluation, identify engaged activities, and offer clinically relevant context for assessments.
Submitted 16 November, 2024; originally announced November 2024.
Comments: GenAI for Health Workshop, NeurIPS 2024.

7. arXiv:2411.10715 [pdf, other] cs.CV
EVT: Efficient View Transformation for Multi-Modal 3D Object Detection
Authors: Yongjin Lee, Hyeon-Mun Jeong, Yurim Jeon, Sanghyun Kim
Abstract: Multi-modal sensor fusion in bird's-eye-view (BEV) representation has become the leading approach in 3D object detection. However, existing methods often rely on depth estimators or transformer encoders for view transformation, incurring substantial computational overhead. Furthermore, the lack of precise geometric correspondence between 2D and 3D spaces leads to spatial and ray-directional misalignments, restricting the effectiveness of BEV representations. To address these challenges, we propose a novel 3D object detector via efficient view transformation (EVT), which leverages a well-structured BEV representation to enhance accuracy and efficiency. EVT focuses on two main areas. First, it employs Adaptive Sampling and Adaptive Projection (ASAP), using LiDAR guidance to generate 3D sampling points and adaptive kernels. The generated points and kernels are then used to transform image features into BEV space and refine the BEV features. Second, EVT includes an improved transformer-based detection framework, which contains a group-wise query initialization method and an enhanced query update framework. It is designed to effectively utilize the obtained multi-modal BEV features within the transformer decoder. By leveraging the geometric properties of object queries, this framework significantly enhances detection performance, especially in a multi-layer transformer decoder structure. EVT achieves state-of-the-art performance on the nuScenes test set with real-time inference speed.
Submitted 19 November, 2024; v1 submitted 16 November, 2024; originally announced November 2024.
8. arXiv:2411.10450 [pdf, other] eess.SP cs.AI cs.HC cs.LG
Dataset Refinement for Improving the Generalization Ability of the EEG Decoding Model
Authors: Sung-Jin Kim, Dae-Hyeok Lee, Hyeon-Taek Han
Abstract: Electroencephalography (EEG) is a widely used neuroimaging approach in brain-computer interfaces due to its non-invasive characteristics and convenience, making it an effective tool for understanding human intentions. Recent research has therefore focused on decoding human intentions from EEG signals using deep learning methods. However, since EEG signals are highly susceptible to noise during acquisition, the dataset is likely to contain noisy data. Although pioneering studies have generally assumed that the dataset is well curated, this assumption does not always hold for EEG datasets. In this paper, we address this issue by designing a dataset refinement algorithm that eliminates noisy data based on metrics evaluating data influence during the training process. We applied the proposed algorithm to two public motor imagery EEG datasets and three different models. The results indicate that retraining the model with the refined dataset consistently leads to better generalization performance than using the original dataset. Hence, we demonstrate that removing noisy data from the training dataset alone can effectively improve the generalization performance of deep learning models in the EEG domain.
Submitted 31 October, 2024; originally announced November 2024.
Comments: 4 pages, 1 figure, conference.
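The train-score-filter-retrain loop described here can be sketched generically. The abstract does not specify the influence metric, so per-sample loss under the first-pass model is used below as a stand-in signal, and the class-mean classifier is a toy placeholder for the EEG decoding models:

```python
# Sketch of dataset refinement: train, score each sample, drop the worst
# fraction as suspected noise, retrain on the refined set.
import numpy as np

def refine_dataset(X, y, train_fn, loss_fn, drop_frac=0.1):
    model = train_fn(X, y)                    # first pass on the raw dataset
    losses = np.array([loss_fn(model, X[i], y[i]) for i in range(len(X))])
    keep = np.argsort(losses)[: int(len(X) * (1 - drop_frac))]
    return train_fn(X[keep], y[keep])         # retrain on refined data

# Toy stand-ins: class-mean classifier and squared-distance "loss".
def train_fn(X, y):
    return {c: X[y == c].mean(axis=0) for c in np.unique(y)}

def loss_fn(model, x, y):
    return float(((x - model[y]) ** 2).sum())

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (100, 8)), rng.normal(3, 1, (100, 8))])
y = np.repeat([0, 1], 100)
refined_model = refine_dataset(X, y, train_fn, loss_fn)
```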
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10450v1-abstract-full').style.display = 'none'; document.getElementById('2411.10450v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 1 figure, conference</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10322">arXiv:2411.10322</a> <span> [<a href="https://arxiv.org/pdf/2411.10322">pdf</a>, <a href="https://arxiv.org/format/2411.10322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Melanoma Detection with Uncertainty Quantification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">SangHyuk Kim</a>, <a href="/search/cs?searchtype=author&query=Gaibor%2C+E">Edward Gaibor</a>, <a href="/search/cs?searchtype=author&query=Matejek%2C+B">Brian Matejek</a>, <a href="/search/cs?searchtype=author&query=Haehn%2C+D">Daniel Haehn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10322v1-abstract-short" style="display: inline;"> Early detection of melanoma is crucial for improving survival rates. Current detection tools often utilize data-driven machine learning methods but often overlook the full integration of multiple datasets. We combine publicly available datasets to enhance data diversity, allowing numerous experiments to train and evaluate various classifiers. We then calibrate them to minimize misdiagnoses by inco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10322v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10322v1-abstract-full" style="display: none;"> Early detection of melanoma is crucial for improving survival rates. Current detection tools often utilize data-driven machine learning methods but often overlook the full integration of multiple datasets. We combine publicly available datasets to enhance data diversity, allowing numerous experiments to train and evaluate various classifiers. We then calibrate them to minimize misdiagnoses by incorporating uncertainty quantification. Our experiments on benchmark datasets show accuracies of up to 93.2% before and 97.8% after applying uncertainty-based rejection, leading to a reduction in misdiagnoses by over 40.5%. Our code and data are publicly available, and a web-based interface for quick melanoma detection of user-supplied images is also provided. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10322v1-abstract-full').style.display = 'none'; document.getElementById('2411.10322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, 3 tables, submitted to ISBI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09707">arXiv:2411.09707</a> <span> [<a href="https://arxiv.org/pdf/2411.09707">pdf</a>, <a href="https://arxiv.org/format/2411.09707">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Decoding Fatigue Levels of Pilots Using EEG Signals with Hybrid Deep Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dae-Hyeok Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sung-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Si-Hyun Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09707v1-abstract-short" style="display: inline;"> The detection of pilots' mental states is critical, as abnormal mental states have the potential to cause catastrophic accidents. This study demonstrates the feasibility of using deep learning techniques to classify different fatigue levels, specifically a normal state, low fatigue, and high fatigue. To the best of our knowledge, this is the first study to classify fatigue levels in pilots. Our ap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09707v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09707v1-abstract-full" style="display: none;"> The detection of pilots' mental states is critical, as abnormal mental states have the potential to cause catastrophic accidents. This study demonstrates the feasibility of using deep learning techniques to classify different fatigue levels, specifically a normal state, low fatigue, and high fatigue. To the best of our knowledge, this is the first study to classify fatigue levels in pilots. Our approach employs the hybrid deep neural network comprising five convolutional blocks and one long short-term memory block to extract the significant features from electroencephalography signals. Ten pilots participated in the experiment, which was conducted in a simulated flight environment. 
Compared to four conventional models, our proposed model achieved a superior grand-average accuracy of 0.8801, outperforming the other models by at least 0.0599 in classifying fatigue levels. In addition to successfully classifying fatigue levels, our model provided valuable feedback to the subjects. We therefore anticipate that our study will contribute significantly to the advancement of autonomous flight and driving technologies leveraging artificial intelligence.
Submitted 30 October, 2024; originally announced November 2024.
Comments: 4 pages, 3 figures, 1 table; International Winter Conference on Brain-Computer Interface.
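The described architecture (five convolutional blocks followed by one LSTM block over EEG windows) can be sketched directly. Channel counts, kernel sizes, and the three-class head below are assumptions; the paper's exact hyperparameters may differ:

```python
# Hypothetical hybrid CNN-LSTM for three fatigue levels (sketch only).
import torch
import torch.nn as nn

class FatigueNet(nn.Module):
    def __init__(self, n_channels=32, n_classes=3):
        super().__init__()
        blocks, c_in = [], n_channels
        for c_out in (32, 32, 64, 64, 128):          # five conv blocks over time
            blocks += [nn.Conv1d(c_in, c_out, kernel_size=5, padding=2),
                       nn.BatchNorm1d(c_out), nn.ELU(), nn.MaxPool1d(2)]
            c_in = c_out
        self.conv = nn.Sequential(*blocks)
        self.lstm = nn.LSTM(128, 64, batch_first=True)  # one LSTM block
        self.head = nn.Linear(64, n_classes)

    def forward(self, x):                    # x: (batch, channels, time)
        h = self.conv(x).transpose(1, 2)     # -> (batch, time', 128)
        _, (hn, _) = self.lstm(h)
        return self.head(hn[-1])             # logits over fatigue levels

logits = FatigueNet()(torch.randn(8, 32, 512))   # -> (8, 3)
```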
11. arXiv:2411.09688 [pdf, other] cs.CL
Squeezed Attention: Accelerating Long Context Length LLM Inference
Authors: Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Monishwaran Maheswaran, June Paik, Michael W. Mahoney, Kurt Keutzer, Amir Gholami
Abstract: Emerging Large Language Model (LLM) applications require long input prompts to perform complex downstream tasks like document analysis and code generation. For these long-context applications, the length of the input prompt poses a significant challenge for inference efficiency, since inference costs increase linearly with sequence length. However, for many of these applications, much of the context in the prompt is fixed across different user inputs, providing the opportunity to perform offline optimizations so that user inputs can be processed quickly as they are received. In this work, we propose Squeezed Attention as a mechanism to accelerate LLM applications where a large portion of the input prompt is fixed. We first use K-means clustering offline to group the keys for the fixed context based on semantic similarity and represent each cluster with a single centroid value. During inference, we compare query tokens from the user input with the centroids to predict which keys from the fixed context are semantically relevant and need to be loaded. We then compute exact attention using only these important keys from the fixed context, reducing bandwidth and computational costs. We also extend our method to use a hierarchical centroid lookup to identify important keys, which can reduce the complexity of attention from linear to logarithmic with respect to the context length. We implement optimized Triton kernels for centroid comparison and sparse FlashAttention with important keys, achieving more than 4x speedups during both the prefill and generation phases for long-context inference. Furthermore, we have extensively evaluated our method on various long-context benchmarks including LongBench, where it achieves a 3x reduction in KV cache budget without accuracy loss and up to an 8x reduction with a <0.5 point accuracy gap for various models.
Submitted 14 November, 2024; originally announced November 2024.
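The cluster-then-exact-attention mechanism described above is concrete enough to sketch. Below is a minimal single-query illustration, not the paper's Triton implementation: the flat (non-hierarchical) lookup, the tensor sizes, and the top-4 cluster budget are all assumptions:

```python
# Sketch of the Squeezed Attention idea: cluster fixed-context keys offline,
# then at inference score centroids instead of all keys and run exact
# attention over keys from the most relevant clusters only.
import torch

def kmeans(keys, k=32, iters=10):
    c = keys[torch.randperm(keys.shape[0])[:k]].clone()
    for _ in range(iters):
        assign = torch.cdist(keys, c).argmin(dim=1)
        for j in range(k):
            m = assign == j
            if m.any():
                c[j] = keys[m].mean(dim=0)
    return c, assign

def squeezed_attention(q, keys, values, centroids, assign, top=4):
    cl = (q @ centroids.T).topk(top).indices     # score centroids, not keys
    mask = torch.isin(assign, cl)                # keys in relevant clusters
    k_sel, v_sel = keys[mask], values[mask]
    att = torch.softmax(q @ k_sel.T / k_sel.shape[-1] ** 0.5, dim=-1)
    return att @ v_sel                           # exact attention, fewer keys

d = 64
keys, values = torch.randn(4096, d), torch.randn(4096, d)
centroids, assign = kmeans(keys)     # offline, once per fixed prompt
out = squeezed_attention(torch.randn(d), keys, values, centroids, assign)
```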
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09302">arXiv:2411.09302</a> <span> [<a href="https://arxiv.org/pdf/2411.09302">pdf</a>, <a href="https://arxiv.org/format/2411.09302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> EEG-Based Speech Decoding: A Novel Approach Using Multi-Kernel Ensemble Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Soowon Kim</a>, <a href="/search/cs?searchtype=author&query=Jo%2C+H">Ha-Na Jo</a>, <a href="/search/cs?searchtype=author&query=Ko%2C+E">Eunyeong Ko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.09302v1-abstract-full" style="display: inline;"> In this study, we propose an ensemble learning framework for electroencephalogram-based overt speech classification, leveraging denoising diffusion probabilistic models with varying convolutional kernel sizes. The ensemble comprises three models with kernel sizes of 51, 101, and 201, effectively capturing multi-scale temporal features inherent in the signals. This approach improves the robustness and accuracy of speech decoding by accommodating the rich temporal complexity of neural signals. The ensemble models work in conjunction with conditional autoencoders that refine the reconstructed signals and maximize the useful information for downstream classification tasks. The results indicate that the proposed ensemble-based approach significantly outperforms individual models and existing state-of-the-art techniques. These findings demonstrate the potential of ensemble methods in advancing brain signal decoding, offering new possibilities for non-verbal communication applications, particularly in brain-computer interface systems aimed at aiding individuals with speech impairments. </span> </p>
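<p class="is-size-7">A minimal sketch of the multi-kernel ensemble idea, assuming plain 1-D convolutional classifiers with kernel sizes 51, 101, and 201 whose logits are averaged; the diffusion-based denoising and conditional autoencoders from the abstract are omitted, and all layer sizes are arbitrary.</p>
<pre><code>import torch
import torch.nn as nn

class KernelBranch(nn.Module):
    """1-D conv encoder with one kernel size, ending in class logits."""
    def __init__(self, kernel_size, n_channels=64, n_classes=5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(n_channels, 32, kernel_size, padding=kernel_size // 2),
            nn.GELU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(32, n_classes),
        )
    def forward(self, x):
        return self.net(x)

# Three branches capture temporal structure at different scales (51/101/201
# samples); the ensemble prediction is the average of their logits.
branches = nn.ModuleList(KernelBranch(k) for k in (51, 101, 201))

def ensemble_logits(eeg):                  # eeg: (batch, channels, time)
    return torch.stack([b(eeg) for b in branches]).mean(dim=0)

logits = ensemble_logits(torch.randn(8, 64, 1000))   # (8, n_classes)
</code></pre>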
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09302v1-abstract-full').style.display = 'none'; document.getElementById('2411.09302v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09211">arXiv:2411.09211</a> <span> [<a href="https://arxiv.org/pdf/2411.09211">pdf</a>, <a href="https://arxiv.org/format/2411.09211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Neural Communication: Convergence of Computer Vision and Brain-Computer Interface </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+J">Ji-Ha Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seo-Hyun Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Soowon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seong-Whan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09211v1-abstract-short" style="display: inline;"> Interpreting human neural signals to decode static speech intentions such as text or images and dynamic speech intentions such as audio or video is showing great potential as an innovative communication tool. Human communication accompanies various features, such as articulatory movements, facial expressions, and internal speech, all of which are reflected in neural signals. However, most studies… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09211v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09211v1-abstract-full" style="display: none;"> Interpreting human neural signals to decode static speech intentions such as text or images and dynamic speech intentions such as audio or video is showing great potential as an innovative communication tool. Human communication accompanies various features, such as articulatory movements, facial expressions, and internal speech, all of which are reflected in neural signals. However, most studies only generate short or fragmented outputs, while providing informative communication by leveraging various features from neural signals remains challenging. In this study, we introduce a dynamic neural communication method that leverages current computer vision and brain-computer interface technologies. Our approach captures the user's intentions from neural signals and decodes visemes in short time steps to produce dynamic visual outputs. The results demonstrate the potential to rapidly capture and reconstruct lip movements during natural speech attempts from human neural signals, enabling dynamic neural communication through the convergence of computer vision and brain--computer interface. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09211v1-abstract-full').style.display = 'none'; document.getElementById('2411.09211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 2 figures, 1 table, Name of Conference: International Conference on Brain-Computer Interface</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08933">arXiv:2411.08933</a> <span> [<a href="https://arxiv.org/pdf/2411.08933">pdf</a>, <a href="https://arxiv.org/format/2411.08933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for Certified Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jang%2C+S">Suhyeok Jang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seojin Kim</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+J">Jinwoo Shin</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+J">Jongheon Jeong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08933v2-abstract-short" style="display: inline;"> The remarkable advances in deep learning have led to the emergence of many off-the-shelf classifiers, e.g., large pre-trained models. However, since they are typically trained on clean data, they remain vulnerable to adversarial attacks. Despite this vulnerability, their superior performance and transferability make off-the-shelf classifiers still valuable in practice, demanding further work to pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08933v2-abstract-full').style.display = 'inline'; document.getElementById('2411.08933v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08933v2-abstract-full" style="display: none;"> The remarkable advances in deep learning have led to the emergence of many off-the-shelf classifiers, e.g., large pre-trained models. However, since they are typically trained on clean data, they remain vulnerable to adversarial attacks. Despite this vulnerability, their superior performance and transferability make off-the-shelf classifiers still valuable in practice, demanding further work to provide adversarial robustness for them in a post-hoc manner. 
A recently proposed method, denoised smoothing, leverages a denoiser model in front of the classifier to obtain provable robustness without additional training. However, the denoiser often creates hallucinations, i.e., images that have lost the semantics of their originally assigned class, leading to a drop in robustness. Furthermore, its noise-and-denoise procedure introduces a significant distribution shift from the original distribution, causing the denoised smoothing framework to achieve sub-optimal robustness. In this paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image Selection (FT-CADIS), a novel fine-tuning scheme to enhance the certified robustness of off-the-shelf classifiers. FT-CADIS is inspired by the observation that the confidence of off-the-shelf classifiers can effectively identify hallucinated images during denoised smoothing. Based on this, we develop a confidence-aware training objective to handle such hallucinated images and improve the stability of fine-tuning from denoised images. In this way, the classifier can be fine-tuned using only images that are beneficial for adversarial robustness. We also find that such fine-tuning can be done by updating only a small fraction of the classifier's parameters. Extensive experiments demonstrate that FT-CADIS has established the state-of-the-art certified robustness among denoised smoothing methods across all $\ell_2$-adversary radii in various benchmarks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages; TMLR 2024; Code is available at https://github.com/suhyeok24/FT-CADIS</span> </p>
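<p class="is-size-7">An illustrative sketch of the confidence-aware selection idea, with placeholder denoiser and classifier modules; the exact FT-CADIS objective and its parameter-efficient fine-tuning are in the paper and the repository linked above.</p>
<pre><code>import torch
import torch.nn.functional as F

# Placeholder modules: any off-the-shelf classifier and denoiser would do.
classifier = torch.nn.Sequential(torch.nn.Flatten(),
                                 torch.nn.Linear(3 * 32 * 32, 10))
denoiser = torch.nn.Identity()      # stand-in for a real denoiser model

def confidence_aware_loss(x, y, sigma=0.5, thresh=0.5):
    """Fine-tuning loss computed only on denoised images the classifier is
    confident about; low-confidence ("hallucinated") images are weighted
    to zero, mirroring the selection idea described in the abstract."""
    noisy = x + sigma * torch.randn_like(x)
    denoised = denoiser(noisy)
    logits = classifier(denoised)
    conf = logits.softmax(dim=-1).gather(1, y[:, None]).squeeze(1)
    keep = (conf > thresh).float()                  # confident images only
    per_sample = F.cross_entropy(logits, y, reduction="none")
    return (keep * per_sample).sum() / keep.sum().clamp(min=1.0)

loss = confidence_aware_loss(torch.randn(16, 3, 32, 32),
                             torch.randint(0, 10, (16,)))
loss.backward()
</code></pre>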
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08149">arXiv:2411.08149</a> <span> [<a href="https://arxiv.org/pdf/2411.08149">pdf</a>, <a href="https://arxiv.org/format/2411.08149">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Design optimization of semiconductor manufacturing equipment using a novel multi-fidelity surrogate modeling approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingran Wang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M+S">Min Sung Kim</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+T">Taewoong Yoon</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dasom Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+B">Byeong-Sang Kim</a>, <a href="/search/cs?searchtype=author&query=Sung%2C+D">Dougyong Sung</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J+T">John T. Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.08149v1-abstract-full" style="display: inline;"> Careful design of semiconductor manufacturing equipment is crucial for ensuring the performance, yield, and reliability of semiconductor devices. Despite this, numerical optimization methods are seldom applied to optimize the design of such equipment due to the difficulty of obtaining accurate simulation models. In this paper, we address a practical and industrially relevant electrostatic chuck (ESC) design optimization problem by proposing a novel multi-fidelity surrogate modeling approach. The optimization aims to improve the temperature uniformity of the wafer during the etching process by adjusting seven parameters associated with the coolant path and embossing. Our approach combines low-fidelity (LF) and high-fidelity (HF) simulation data to efficiently predict spatial-field quantities, even with a limited number of data points. We use proper orthogonal decomposition (POD) to project the spatially interpolated HF and LF field data onto a shared latent space, followed by the construction of a multi-fidelity kriging model to predict the latent variables of the HF output field. In the ESC design problem, with hundreds of data points or fewer, our approach achieves a more than 10% reduction in prediction error compared to using kriging models with only HF or LF data. Additionally, in the ESC optimization problem, our proposed method yields better solutions with improvements in all of the quantities of interest, while requiring a 20% lower data-generation cost compared to the HF surrogate modeling approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
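<p class="is-size-7">A minimal sketch of the POD-plus-surrogate pipeline, assuming aligned LF/HF field snapshots and using scikit-learn's GaussianProcessRegressor as a stand-in for the paper's multi-fidelity kriging model; all sizes and data below are synthetic.</p>
<pre><code>import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(2)
n_hf, n_field, n_params, r = 20, 500, 7, 5

X_hf = rng.uniform(size=(n_hf, n_params))        # 7 design parameters
F_hf = rng.normal(size=(n_hf, n_field))          # HF temperature fields
F_lf = F_hf + 0.1 * rng.normal(size=F_hf.shape)  # aligned LF fields

# POD: project both fidelities onto a shared latent basis from stacked snapshots.
snapshots = np.vstack([F_hf, F_lf])
mean = snapshots.mean(axis=0)
_, _, Vt = np.linalg.svd(snapshots - mean, full_matrices=False)
basis = Vt[:r]                                    # (r, n_field)

z_hf = (F_hf - mean) @ basis.T                    # HF latent coordinates
z_lf = (F_lf - mean) @ basis.T                    # LF latent coordinates

# Surrogate: predict HF latents from design params plus LF latents
# (a simple stand-in for the paper's multi-fidelity kriging model).
gp = GaussianProcessRegressor().fit(np.hstack([X_hf, z_lf]), z_hf)

def predict_hf_field(x_new, z_lf_new):
    z = gp.predict(np.hstack([x_new, z_lf_new])[None, :])
    return (mean + z @ basis).ravel()             # back to the full field

field = predict_hf_field(rng.uniform(size=n_params), rng.normal(size=r))
</code></pre>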
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07729">arXiv:2411.07729</a> <span> [<a href="https://arxiv.org/pdf/2411.07729">pdf</a>, <a href="https://arxiv.org/format/2411.07729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring the loss landscape of regularized neural networks via convex duality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungyoon Kim</a>, <a href="/search/cs?searchtype=author&query=Mishkin%2C+A">Aaron Mishkin</a>, <a href="/search/cs?searchtype=author&query=Pilanci%2C+M">Mert Pilanci</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07729v1-abstract-full" style="display: inline;"> We discuss several aspects of the loss landscape of regularized neural networks: the structure of stationary points, the connectivity of optimal solutions, paths with nonincreasing loss to an arbitrary global optimum, and the nonuniqueness of optimal solutions, by casting the problem into an equivalent convex problem and considering its dual. Starting from two-layer neural networks with scalar output, we first characterize the solution set of the convex problem using its dual and further characterize all stationary points. With this characterization, we show that the topology of the global optima goes through a phase transition as the width of the network changes, and construct counterexamples where the problem may have a continuum of optimal solutions. Finally, we show that the solution set characterization and connectivity results can be extended to different architectures, including two-layer vector-valued neural networks and parallel three-layer neural networks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07546">arXiv:2411.07546</a> <span> [<a href="https://arxiv.org/pdf/2411.07546">pdf</a>, <a href="https://arxiv.org/format/2411.07546">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Language Prompting to Ease False Positives in Medical Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+Y">YeongHyeon Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M+J">Myung Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+S">Hyeong Seok Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.07546v1-abstract-full" style="display: inline;"> A pre-trained visual-language model, contrastive language-image pre-training (CLIP), successfully accomplishes various downstream tasks with text prompts, such as finding images or localizing regions within the image. Despite CLIP's strong multi-modal data capabilities, it remains limited in specialized environments, such as medical applications. For this purpose, many CLIP variants, e.g., BioMedCLIP and MedCLIP-SAMv2, have emerged, but false positives related to normal regions persist. Thus, we aim at the simple yet important goal of reducing false positives in medical anomaly detection. We introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both positive and negative text prompts. This straightforward approach identifies potential lesion regions by visual attention to the positive prompts in the given image. To reduce false positives, we attenuate attention on normal regions using negative prompts. Extensive experiments with the BMAD dataset, including six biomedical benchmarks, demonstrate that the CLAP method enhances anomaly detection performance. Our future plans include developing an automated fine prompting method for more practical usage. </span> </p>
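<p class="is-size-7">A toy sketch of contrastive language prompting with random stand-in embeddings in place of a real CLIP encoder: attention to the positive prompt flags candidate lesions, and attention to the negative prompt is subtracted to suppress normal regions.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(3)
d, n_patches = 512, 196                      # ViT-like patch grid (14 x 14)

patch_emb = rng.normal(size=(n_patches, d))  # stand-in image patch embeddings
pos_text = rng.normal(size=d)                # e.g. "a photo with a lesion"
neg_text = rng.normal(size=d)                # e.g. "a photo of normal tissue"

def cosine(a, b):
    return a @ b / (np.linalg.norm(a, axis=-1) * np.linalg.norm(b) + 1e-8)

# Visual attention to the positive prompt flags candidate lesions; attention
# to the negative prompt is subtracted to attenuate normal-region responses.
pos_map = cosine(patch_emb, pos_text)
neg_map = cosine(patch_emb, neg_text)
anomaly_map = np.clip(pos_map - neg_map, 0.0, None).reshape(14, 14)
</code></pre>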
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07546v1-abstract-full').style.display = 'none'; document.getElementById('2411.07546v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 3 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07451">arXiv:2411.07451</a> <span> [<a href="https://arxiv.org/pdf/2411.07451">pdf</a>, <a href="https://arxiv.org/format/2411.07451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Data Delivery: Insights from User Preferences on Visuals, Tables, and Text </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luera%2C+R">Reuben Luera</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R">Ryan Rossi</a>, <a href="/search/cs?searchtype=author&query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&query=Siu%2C+A">Alexa Siu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Lipka%2C+N">Nedim Lipka</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhehao Zhang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+G">Seon Gyeom Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T+Y">Tak Yeon Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07451v1-abstract-short" style="display: inline;"> In this work, we research user preferences to see a chart, table, or text given a question asked by the user. This enables us to understand when it is best to show a chart, table, or text to the user for the specific question. For this, we conduct a user study where users are shown a question and asked what they would prefer to see and used the data to establish that a user's personal traits does… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07451v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07451v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07451v1-abstract-full" style="display: none;"> In this work, we research user preferences to see a chart, table, or text given a question asked by the user. 
This enables us to understand when it is best to show a chart, table, or text to the user for a specific question. To this end, we conduct a user study where users are shown a question and asked what they would prefer to see, and we use the data to establish that a user's personal traits do influence the data outputs that they prefer. Understanding how user characteristics impact a user's preferences is critical to creating data tools with a better user experience. Additionally, we investigate to what degree an LLM can be used to replicate a user's preference with and without user preference data. Overall, these findings have significant implications pertaining to the development of data tools and the replication of human preferences using LLMs. Furthermore, this work demonstrates the potential use of LLMs to replicate user preference data, which has major implications for future user modeling and personalization research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05832">arXiv:2411.05832</a> <span> [<a href="https://arxiv.org/pdf/2411.05832">pdf</a>, <a href="https://arxiv.org/format/2411.05832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Diversify, Contextualize, and Adapt: Efficient Entropy Modeling for Neural Image Codec </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jun-Hyuk Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungeon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+W">Won-Hee Lee</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+D">Dokwan Oh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.05832v1-abstract-full" style="display: inline;">
Designing a fast and effective entropy model is challenging but essential for practical application of neural codecs. Beyond spatial autoregressive entropy models, more efficient backward adaptation-based entropy models have recently been developed. They not only reduce decoding time by using a smaller number of modeling steps but also maintain or even improve rate-distortion performance by leveraging more diverse contexts for backward adaptation. Despite this significant progress, we argue that their performance has been limited by the simple adoption of the design convention for forward adaptation: using only a single type of hyper latent representation, which does not provide sufficient contextual information, especially in the first modeling step. In this paper, we propose a simple yet effective entropy modeling framework that leverages sufficient contexts for forward adaptation without compromising on bit-rate. Specifically, we introduce a strategy of diversifying hyper latent representations for forward adaptation, i.e., using two additional types of contexts along with the existing single type of context. In addition, we present a method to effectively use the diverse contexts for contextualizing the current elements to be encoded/decoded. By addressing the limitation of the previous approach, our proposed framework leads to significant performance improvements. Experimental results on popular datasets show that our proposed framework consistently improves rate-distortion performance across various bit-rate regions, e.g., a 3.73% BD-rate gain over the state-of-the-art baseline on the Kodak dataset. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p>
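<p class="is-size-7">A rough sketch of an entropy model that fuses several context sources to predict per-element Gaussian parameters for the latent, loosely following the context-diversification idea above; the module name, layer sizes, and three-context setup are arbitrary assumptions.</p>
<pre><code>import torch
import torch.nn as nn

class MultiContextEntropyModel(nn.Module):
    """Predicts per-element Gaussian parameters (mean, scale) for the latent
    from several context sources, loosely following the idea of diversified
    forward-adaptation contexts; layer sizes here are arbitrary."""
    def __init__(self, c_latent=192, c_ctx=64):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Conv2d(3 * c_ctx, 256, 1), nn.GELU(),
            nn.Conv2d(256, 2 * c_latent, 1),   # outputs mean and scale
        )
    def forward(self, ctx_a, ctx_b, ctx_c):
        params = self.fuse(torch.cat([ctx_a, ctx_b, ctx_c], dim=1))
        mean, scale = params.chunk(2, dim=1)
        return mean, nn.functional.softplus(scale)  # positive scales

model = MultiContextEntropyModel()
ctxs = [torch.randn(1, 64, 16, 16) for _ in range(3)]
mean, scale = model(*ctxs)    # each (1, 192, 16, 16)
</code></pre>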
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05811">arXiv:2411.05811</a> <span> [<a href="https://arxiv.org/pdf/2411.05811">pdf</a>, <a href="https://arxiv.org/format/2411.05811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neurophysiological Analysis in Motor and Sensory Cortices for Improving Motor Imagination </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Si-Hyun Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sung-Jin Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dae-Hyeok Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05811v1-abstract-short" style="display: inline;"> Brain-computer interface (BCI) enables direct communication between the brain and external devices by decoding neural signals, offering potential solutions for individuals with motor impairments. This study explores the neural signatures of motor execution (ME) and motor imagery (MI) tasks using EEG signals, focusing on four conditions categorized as sense-related (hot and cold) and motor-related… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05811v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05811v1-abstract-full" style="display: none;"> Brain-computer interface (BCI) enables direct communication between the brain and external devices by decoding neural signals, offering potential solutions for individuals with motor impairments. This study explores the neural signatures of motor execution (ME) and motor imagery (MI) tasks using EEG signals, focusing on four conditions categorized as sense-related (hot and cold) and motor-related (pull and push) conditions. We conducted scalp topography analysis to examine activation patterns in the sensorimotor cortex, revealing distinct regional differences: sense--related conditions primarily activated the posterior region of the sensorimotor cortex, while motor--related conditions activated the anterior region of the sensorimotor cortex. These spatial distinctions align with neurophysiological principles, suggesting condition-specific functional subdivisions within the sensorimotor cortex. We further evaluated the performances of three neural network models-EEGNet, ShallowConvNet, and DeepConvNet-demonstrating that ME tasks achieved higher classification accuracies compared to MI tasks. Specifically, in sense-related conditions, the highest accuracy was observed in the cold condition. In motor-related conditions, the pull condition showed the highest performance, with DeepConvNet yielding the highest results. 
These findings provide insights into optimizing BCI applications by leveraging specific condition-induced neural activations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages, 3 figures, 1 table, Name of Conference: International Winter Conference on Brain-Computer Interface</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05165">arXiv:2411.05165</a> <span> [<a href="https://arxiv.org/pdf/2411.05165">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Haptic Dial based on Magnetorheological Fluid Having Bumpy Structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+S+H">Seok Hun Lee</a>, <a href="/search/cs?searchtype=author&query=Heo%2C+Y+H">Yong Hae Heo</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seok-Han Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sang-Youn Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.05165v1-abstract-full" style="display: inline;"> We propose a haptic dial based on magnetorheological fluid (MRF) that enhances performance by increasing the MRF-exposed area through a concave shaft and housing structure. We developed a breakout-style game to show that the proposed haptic dial allows users to efficiently interact with virtual objects. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of proceedings of 6th International Conference AsiaHaptics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05153">arXiv:2411.05153</a> <span> [<a href="https://arxiv.org/pdf/2411.05153">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Wearable Haptic Device to Render 360-degree Torque Feedback on the Wrist </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungchae Kim</a>, <a href="/search/cs?searchtype=author&query=Hashem%2C+M+S">Mohammad Shadman Hashem</a>, <a href="/search/cs?searchtype=author&query=Jeon%2C+S">Seokhee Jeon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05153v1-abstract-short" style="display: inline;"> Haptic feedback increases the realism of virtual environments. This paper proposes a wearable haptic device that renders torque feedback to the user's wrist from any angle. The device comprises a control part and a handle part. The control part consists of three DC gear motors and a microcontroller, while the handle part securely holds the Oculus Quest 2 right controller. The control part manages… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05153v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05153v1-abstract-full" style="display: none;"> Haptic feedback increases the realism of virtual environments. This paper proposes a wearable haptic device that renders torque feedback to the user's wrist from any angle. The device comprises a control part and a handle part. The control part consists of three DC gear motors and a microcontroller, while the handle part securely holds the Oculus Quest 2 right controller. The control part manages string tension to deliver the sensation of torque feedback during interactions with virtual tools or objects. The three points of the handle part are connected to the three motors of the control part via strings, which pull the handle part to render precise 360-degree (yaw and pitch) torque feedback to the user's wrist. Finally, to show the effectiveness of the proposed device, two VR demos were implemented- Shooting Game and Shielding Experience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05153v1-abstract-full').style.display = 'none'; document.getElementById('2411.05153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part of proceedings of 6th International Conference AsiaHaptics 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02625">arXiv:2411.02625</a> <span> [<a href="https://arxiv.org/pdf/2411.02625">pdf</a>, <a href="https://arxiv.org/format/2411.02625">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> EmoSphere++: Emotion-Controllable Zero-Shot Text-to-Speech via Emotion-Adaptive Spherical Vector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cho%2C+D">Deok-Hyeon Cho</a>, <a href="/search/cs?searchtype=author&query=Oh%2C+H">Hyung-Seok Oh</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seung-Bin Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Seong-Whan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02625v1-abstract-short" style="display: inline;"> Emotional text-to-speech (TTS) technology has achieved significant progress in recent years; however, challenges remain owing to the inherent complexity of emotions and limitations of the available emotional speech datasets and models. Previous studies typically relied on limited emotional speech datasets or required extensive manual annotations, restricting their ability to generalize across diff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02625v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02625v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02625v1-abstract-full" style="display: none;"> Emotional text-to-speech (TTS) technology has achieved significant progress in recent years; however, challenges remain owing to the inherent complexity of emotions and limitations of the available emotional speech datasets and models. Previous studies typically relied on limited emotional speech datasets or required extensive manual annotations, restricting their ability to generalize across different speakers and emotional styles. In this paper, we present EmoSphere++, an emotion-controllable zero-shot TTS model that can control emotional style and intensity to resemble natural human speech. We introduce a novel emotion-adaptive spherical vector that models emotional style and intensity without human annotation. Moreover, we propose a multi-level style encoder that can ensure effective generalization for both seen and unseen speakers. We also introduce additional loss functions to enhance the emotion transfer performance for zero-shot scenarios. We employ a conditional flow matching-based decoder to achieve high-quality and expressive emotional TTS in a few sampling steps. Experimental results demonstrate the effectiveness of the proposed framework. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02625v1-abstract-full').style.display = 'none'; document.getElementById('2411.02625v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02225">arXiv:2411.02225</a> <span> [<a href="https://arxiv.org/pdf/2411.02225">pdf</a>, <a href="https://arxiv.org/ps/2411.02225">ps</a>, <a href="https://arxiv.org/format/2411.02225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> </div> </div> <p class="title is-5 mathjax"> Variable Selection in Convex Piecewise Linear Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kanj%2C+H">Haitham Kanj</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seonho Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kiryung Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02225v1-abstract-short" style="display: inline;"> This paper presents Sparse Gradient Descent as a solution for variable selection in convex piecewise linear regression where the model is given as $\mathrm{max}\langle a_j^\star, x \rangle + b_j^\star$ for $j = 1,\dots,k$ where $x \in \mathbb R^d$ is the covariate vector. Here, $\{a_j^\star\}_{j=1}^k$ and $\{b_j^\star\}_{j=1}^k$ denote the ground-truth weight vectors and intercepts. A non-asymptot… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02225v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02225v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02225v1-abstract-full" style="display: none;"> This paper presents Sparse Gradient Descent as a solution for variable selection in convex piecewise linear regression where the model is given as $\mathrm{max}\langle a_j^\star, x \rangle + b_j^\star$ for $j = 1,\dots,k$ where $x \in \mathbb R^d$ is the covariate vector. Here, $\{a_j^\star\}_{j=1}^k$ and $\{b_j^\star\}_{j=1}^k$ denote the ground-truth weight vectors and intercepts. A non-asymptotic local convergence analysis is provided for Sp-GD under sub-Gaussian noise when the covariate distribution satisfies sub-Gaussianity and anti-concentration property. When the model order and parameters are fixed, Sp-GD provides an $蔚$-accurate estimate given $\mathcal{O}(\max(蔚^{-2}蟽_z^2,1)s\log(d/s))$ observations where $蟽_z^2$ denotes the noise variance. This also implies the exact parameter recovery by Sp-GD from $\mathcal{O}(s\log(d/s))$ noise-free observations. 
Since optimizing the squared loss for sparse max-affine regression is non-convex, an initialization scheme is proposed to provide a suitable initial estimate within the basin of attraction for Sp-GD, i.e., one sufficiently accurate to invoke the convergence guarantees. The initialization scheme uses sparse principal component analysis to estimate the subspace spanned by $\{a_j^\star\}_{j=1}^k$, then applies an $r$-covering search to estimate the model parameters. A non-asymptotic analysis is presented for this initialization scheme when the covariates and noise samples follow Gaussian distributions. When the model order and parameters are fixed, this initialization scheme provides an $\varepsilon$-accurate estimate given $\mathcal{O}(\varepsilon^{-2}\max(\sigma_z^4,\sigma_z^2,1)s^2\log^4(d))$ observations. Numerical Monte Carlo results corroborate the theoretical findings for Sp-GD and the initialization scheme. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
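<p class="is-size-7">A toy sketch of Sp-GD-style iterations on synthetic data: a gradient step on the squared loss of the max-affine model followed by joint hard thresholding onto $s$ coordinates. The sparse-PCA initialization and $r$-covering search are omitted, and the step size and iteration count are arbitrary.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(4)
n, d, k, s = 400, 50, 3, 5

# Ground truth: k sparse weight vectors supported on the same few coordinates.
support = rng.choice(d, s, replace=False)
A = np.zeros((k, d)); A[:, support] = rng.normal(size=(k, s))
b = rng.normal(size=k)
X = rng.normal(size=(n, d))
y = (X @ A.T + b).max(axis=1) + 0.01 * rng.normal(size=n)

def hard_threshold(M, s):
    """Keep the s coordinates (columns) with the largest joint energy."""
    keep = np.argsort(-np.linalg.norm(M, axis=0))[:s]
    out = np.zeros_like(M); out[:, keep] = M[:, keep]
    return out

# Sp-GD-style loop: gradient step on the max-affine squared loss, then
# joint hard thresholding of the weight matrix.
Ah = hard_threshold(rng.normal(size=(k, d)) * 0.1, s)
bh = np.zeros(k)
lr = 0.5
for _ in range(500):
    scores = X @ Ah.T + bh                 # (n, k)
    j = scores.argmax(axis=1)              # active affine piece per sample
    resid = scores[np.arange(n), j] - y
    G = np.zeros_like(Ah); gb = np.zeros(k)
    for m in range(k):
        mask = (j == m)
        G[m] = resid[mask] @ X[mask] / n
        gb[m] = resid[mask].sum() / n
    Ah = hard_threshold(Ah - lr * G, s)
    bh = bh - lr * gb

print("recovered support:", np.sort(np.flatnonzero(np.linalg.norm(Ah, axis=0))))
</code></pre>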
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01801">arXiv:2411.01801</a> <span> [<a href="https://arxiv.org/pdf/2411.01801">pdf</a>, <a href="https://arxiv.org/format/2411.01801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Bootstrapping Top-down Information for Self-modulating Slot Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+D">Dongwon Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seoyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Kwak%2C+S">Suha Kwak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.01801v2-abstract-full" style="display: inline;"> Object-centric learning (OCL) aims to learn representations of individual objects within visual scenes without manual supervision, facilitating efficient and effective visual reasoning. Traditional OCL methods primarily employ bottom-up approaches that aggregate homogeneous visual features to represent objects. However, in complex visual environments, these methods often fall short due to the heterogeneous nature of visual features within an object. To address this, we propose a novel OCL framework incorporating a top-down pathway. This pathway first bootstraps the semantics of individual objects and then modulates the model to prioritize features relevant to these semantics. By dynamically modulating the model based on its own output, our top-down pathway enhances the representational quality of objects. Our framework achieves state-of-the-art performance across multiple synthetic and real-world object-discovery benchmarks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p>
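<p class="is-size-7">A toy sketch of one possible bottom-up/top-down loop: a simplified slot-attention update followed by a feature gain derived from the current slots; the dimensions and the modulation form are assumptions, not the authors' architecture.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(5)
n_feat, n_slots, d = 64, 4, 32

feats = rng.normal(size=(n_feat, d))     # visual features from an encoder
slots = rng.normal(size=(n_slots, d))
W_gain = rng.normal(size=(d, d)) / np.sqrt(d)

def softmax(z, axis):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

for _ in range(3):
    # Bottom-up: standard slot-attention update (softmax over slots).
    attn = softmax(feats @ slots.T / np.sqrt(d), axis=1)   # (n_feat, n_slots)
    slots = (attn / (attn.sum(axis=0, keepdims=True) + 1e-8)).T @ feats

    # Top-down: bootstrap per-slot semantics into a gain that modulates the
    # features, emphasizing dimensions relevant to the current slots.
    gain = 1.0 / (1.0 + np.exp(-(slots.mean(axis=0) @ W_gain)))  # (d,)
    feats = feats * gain

print(slots.shape)   # (4, 32)
</code></pre>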
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01757">arXiv:2411.01757</a> <span> [<a href="https://arxiv.org/pdf/2411.01757">pdf</a>, <a href="https://arxiv.org/format/2411.01757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Spurious Correlations via Disagreement Probability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+H">Hyeonggeun Han</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sehwan Kim</a>, <a href="/search/cs?searchtype=author&query=Joo%2C+H">Hyungjun Joo</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+S">Sangwoo Hong</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jungwoo Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.01757v1-abstract-full" style="display: inline;"> Models trained with empirical risk minimization (ERM) are prone to be biased towards spurious correlations between target labels and bias attributes, which leads to poor performance on data groups lacking spurious correlations. It is particularly challenging to address this problem when access to bias labels is not permitted. To mitigate the effect of spurious correlations without bias labels, we first introduce a novel training objective designed to robustly enhance model performance across all data samples, irrespective of the presence of spurious correlations. From this objective, we then derive a debiasing method, Disagreement Probability based Resampling for debiasing (DPR), which does not require bias labels. DPR leverages the disagreement between the target label and the prediction of a biased model to identify bias-conflicting samples (those without spurious correlations) and upsamples them according to the disagreement probability. Empirical evaluations on multiple benchmarks demonstrate that DPR achieves state-of-the-art performance over existing baselines that do not use bias labels. Furthermore, we provide a theoretical analysis that details how DPR reduces dependency on spurious correlations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
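<p class="is-size-7">A minimal sketch of disagreement-based resampling: given a biased model's class probabilities, training samples are drawn with probability proportional to how strongly that model disagrees with the target label. The probabilities below are random stand-ins for a real biased model's outputs.</p>
<pre><code>import numpy as np

rng = np.random.default_rng(6)
n, n_classes = 1000, 2

labels = rng.integers(0, n_classes, size=n)
# Stand-in for a trained biased model's class probabilities on each sample.
biased_probs = rng.dirichlet(np.ones(n_classes), size=n)

# Disagreement probability: how likely the biased model's prediction differs
# from the target label; bias-conflicting samples get large values.
p_disagree = 1.0 - biased_probs[np.arange(n), labels]

# Resample the training set with probability proportional to disagreement,
# upsampling bias-conflicting samples as in the DPR idea described above.
weights = p_disagree / p_disagree.sum()
resampled_idx = rng.choice(n, size=n, replace=True, p=weights)
</code></pre>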
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00578">arXiv:2411.00578</a> <span> [<a href="https://arxiv.org/pdf/2411.00578">pdf</a>, <a href="https://arxiv.org/format/2411.00578">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Federated Voxel Scene Graph for Intracranial Hemorrhage </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sanner%2C+A+P">Antoine P. Sanner</a>, <a href="/search/cs?searchtype=author&query=Stieber%2C+J">Jonathan Stieber</a>, <a href="/search/cs?searchtype=author&query=Grauhan%2C+N+F">Nils F. Grauhan</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Suam Kim</a>, <a href="/search/cs?searchtype=author&query=Brockmann%2C+M+A">Marc A. Brockmann</a>, <a href="/search/cs?searchtype=author&query=Othman%2C+A+E">Ahmed E. Othman</a>, <a href="/search/cs?searchtype=author&query=Mukhopadhyay%2C+A">Anirban Mukhopadhyay</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.00578v1-abstract-full"> Intracranial Hemorrhage is a potentially lethal condition whose manifestation is vastly diverse and shifts across clinical centers worldwide. Deep-learning-based solutions are starting to model complex relations between brain structures, but still struggle to generalize. While gathering more diverse data is the most natural approach, privacy regulations often limit the sharing of medical data. We propose the first application of Federated Scene Graph Generation. We show that our models can leverage the increased training data diversity. For Scene Graph Generation, they can recall up to 20% more clinically relevant relations across datasets compared to models trained on a single centralized dataset. Learning structured data representations in a federated setting can open the way to the development of new methods that can leverage this finer information to regularize across clients more effectively. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00360">arXiv:2411.00360</a> <span> [<a href="https://arxiv.org/pdf/2411.00360">pdf</a>, <a href="https://arxiv.org/format/2411.00360">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple Remedy for Dataset Bias via Self-Influence: A Mislabeled Sample Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jung%2C+Y">Yeonsung Jung</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jaeyun Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J+Y">June Yong Yang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jin-Hwa Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sung-Yub Kim</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+E">Eunho Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.00360v1-abstract-full"> Learning generalized models from biased data is an important undertaking toward fairness in deep learning. To address this issue, recent studies attempt to identify and leverage bias-conflicting samples free from spurious correlations without prior knowledge of bias or an unbiased set. However, spurious correlation remains an ongoing challenge, primarily due to the difficulty in precisely detecting these samples. In this paper, inspired by the similarities between mislabeled samples and bias-conflicting samples, we approach this challenge from the novel perspective of mislabeled sample detection. Specifically, we delve into the Influence Function, one of the standard methods for mislabeled sample detection, to identify bias-conflicting samples, and propose a simple yet effective remedy for biased models that leverages these samples. Through comprehensive analysis and experiments on diverse datasets, we demonstrate that our new perspective can boost the precision of detection and rectify biased models effectively. Furthermore, our approach is complementary to existing methods, showing performance improvement even when applied to models that have already undergone recent debiasing techniques. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
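<p class="is-size-7">The detection idea above maps onto a short sketch. The following uses the common first-order "self-influence" proxy, the squared gradient norm of a sample's own loss; the paper's exact influence estimator may differ, and <code>model</code> and <code>dataset</code> are placeholders.</p> <pre><code># Minimal sketch of self-influence scoring for bias-conflicting detection
# (a standard first-order proxy, not necessarily the paper's estimator).
import torch
import torch.nn.functional as F

def self_influence(model, x, y):
    """Squared norm of the gradient of the sample's own loss w.r.t. parameters."""
    model.zero_grad(set_to_none=True)
    loss = F.cross_entropy(model(x.unsqueeze(0)), y.view(1))
    params = [p for p in model.parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params)
    return sum(g.pow(2).sum().item() for g in grads)

# Usage sketch: score every sample, flag the highest-scoring fraction as likely
# bias-conflicting, then upweight those samples when retraining the model.
# scores = [self_influence(model, x, y) for x, y in dataset]
</code></pre>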
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00322">arXiv:2411.00322</a> <span> [<a href="https://arxiv.org/pdf/2411.00322">pdf</a>, <a href="https://arxiv.org/format/2411.00322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Constant Acceleration Flow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dogyun Park</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Sojin Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sihyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+T">Taehoon Lee</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Youngjoon Hong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H+J">Hyunwoo J. Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.00322v1-abstract-full"> Rectified flow and reflow procedures have significantly advanced fast generation by progressively straightening ordinary differential equation (ODE) flows. They operate under the assumption that image and noise pairs, known as couplings, can be approximated by straight trajectories with constant velocity. However, we observe that modeling with constant velocity and using reflow procedures have limitations in accurately learning straight trajectories between pairs, resulting in suboptimal performance in few-step generation. To address these limitations, we introduce Constant Acceleration Flow (CAF), a novel framework based on a simple constant acceleration equation. CAF introduces acceleration as an additional learnable variable, allowing for more expressive and accurate estimation of the ODE flow. Moreover, we propose two techniques to further improve estimation accuracy: initial velocity conditioning for the acceleration model and a reflow process for the initial velocity. Our comprehensive studies on toy datasets, CIFAR-10, and ImageNet 64x64 demonstrate that CAF outperforms state-of-the-art baselines for one-step generation. We also show that CAF dramatically improves few-step coupling preservation and inversion over rectified flow. Code is available at <a href="https://github.com/mlvlab/CAF">https://github.com/mlvlab/CAF</a>. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
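<p class="is-size-7">The constant-acceleration parameterization has a closed-form update, which makes a few-step sampler easy to sketch. This is a hedged reading of the abstract, not the released code linked above; <code>v0_net</code> and <code>a_net</code> are hypothetical networks, and the noise-at-t=0 convention is an assumption.</p> <pre><code># Sketch of sampling under a constant-acceleration trajectory: with velocity
# v(t) = v0 + a*t, the position update x(t+dt) = x(t) + v(t)*dt + 0.5*a*dt^2
# is exact, so very few integration steps are needed.
import torch

@torch.no_grad()
def caf_sample(v0_net, a_net, z, steps=2):
    x, t = z, 0.0
    dt = 1.0 / steps
    v0 = v0_net(z)      # initial velocity, predicted once at the noise sample
    a = a_net(z)        # constant acceleration for this trajectory
    for _ in range(steps):
        x = x + (v0 + a * t) * dt + 0.5 * a * dt * dt   # exact closed-form step
        t += dt
    return x
</code></pre>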
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00027">arXiv:2411.00027</a> <span> [<a href="https://arxiv.org/pdf/2411.00027">pdf</a>, <a href="https://arxiv.org/format/2411.00027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Personalization of Large Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhehao Zhang</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&query=Kveton%2C+B">Branislav Kveton</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Y">Yijia Shao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Diyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zamani%2C+H">Hamed Zamani</a>, <a href="/search/cs?searchtype=author&query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&query=Barrow%2C+J">Joe Barrow</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jiuxiang Gu</a>, <a href="/search/cs?searchtype=author&query=Derr%2C+T">Tyler Derr</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongjie Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+S">Subrata Mitra</a>, <a href="/search/cs?searchtype=author&query=Lipka%2C+N">Nedim Lipka</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+N">Nesreen Ahmed</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.00027v1-abstract-full"> Personalization of Large Language Models (LLMs) has recently become increasingly important with a wide range of applications. Despite the importance and recent progress, most existing works on personalized LLMs have focused either entirely on (a) personalized text generation or (b) leveraging LLMs for personalization-related downstream applications, such as recommendation systems. In this work, we bridge the gap between these two separate main directions for the first time by introducing a taxonomy for personalized LLM usage and summarizing the key differences and challenges. We provide a formalization of the foundations of personalized LLMs that consolidates and expands notions of personalization of LLMs, defining and discussing novel facets of personalization, usage, and desiderata of personalized LLMs. We then unify the literature across these diverse fields and usage scenarios by proposing systematic taxonomies for the granularity of personalization, personalization techniques, datasets, evaluation methods, and applications of personalized LLMs. Finally, we highlight challenges and important open problems that remain to be addressed. By unifying and surveying recent research using the proposed taxonomies, we aim to provide a clear guide to the existing literature and different facets of personalization in LLMs, empowering both researchers and practitioners. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23629">arXiv:2410.23629</a> <span> [<a href="https://arxiv.org/pdf/2410.23629">pdf</a>, <a href="https://arxiv.org/format/2410.23629">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Posture-Informed Muscular Force Learning for Robust Hand Pressure Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Seo%2C+K">Kyungjin Seo</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+J">Junghoon Seo</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+H">Hanseok Jeong</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sangpil Kim</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+S+H">Sang Ho Yoon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.23629v2-abstract-full"> We present PiMForce, a novel framework that enhances hand pressure estimation by leveraging 3D hand posture information to augment forearm surface electromyography (sEMG) signals. Our approach utilizes detailed spatial information from 3D hand poses in conjunction with dynamic muscle activity from sEMG to enable accurate and robust whole-hand pressure measurements under diverse hand-object interactions. We also developed a multimodal data collection system that combines a pressure glove, an sEMG armband, and a markerless finger-tracking module. We created a comprehensive dataset from 21 participants, capturing synchronized data of hand posture, sEMG signals, and exerted hand pressure across various hand postures and hand-object interaction scenarios using our collection system. Our framework enables precise hand pressure estimation in complex and natural interaction scenarios. Our approach substantially mitigates the limitations of traditional sEMG-based or vision-based methods by integrating 3D hand posture information with sEMG signals. Video demos, data, and code are available online. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024. Project Page Link: https://pimforce.hcitech.org/</span> </p> </li>
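<p class="is-size-7">A minimal sketch of the posture-plus-sEMG fusion described above, assuming simple MLP encoders and a concatenation-based regression head; the joint count, electrode count, and pressure-region count are placeholders, not the paper's actual configuration.</p> <pre><code># Hypothetical fusion sketch (not the released PiMForce code): encode 3D hand
# posture and sEMG separately, concatenate, and regress whole-hand pressure.
import torch
import torch.nn as nn

class PosturePressureNet(nn.Module):
    def __init__(self, n_joints=21, emg_ch=8, n_regions=17):
        super().__init__()
        self.pose_enc = nn.Sequential(nn.Flatten(), nn.Linear(n_joints * 3, 128), nn.ReLU())
        self.emg_enc = nn.Sequential(nn.Linear(emg_ch, 128), nn.ReLU())
        self.head = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, n_regions))

    def forward(self, pose, emg):      # pose: (B, n_joints, 3), emg: (B, emg_ch)
        z = torch.cat([self.pose_enc(pose), self.emg_enc(emg)], dim=-1)
        return self.head(z)            # per-region pressure estimates
</code></pre>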
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23413">arXiv:2410.23413</a> <span> [<a href="https://arxiv.org/pdf/2410.23413">pdf</a>, <a href="https://arxiv.org/format/2410.23413">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EchoFM: Foundation Model for Generalizable Echocardiogram Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sekeun Kim</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+P">Pengfei Jin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Sifan Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Cheng Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hui Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Quanzheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.23413v1-abstract-full"> Foundation models have recently gained significant attention because of their generalizability and adaptability across multiple tasks and data distributions. Although medical foundation models have emerged, solutions for cardiac imaging, especially echocardiography videos, are still unexplored. In this paper, we introduce EchoFM, a foundation model specifically designed to represent and analyze echocardiography videos. In EchoFM, we propose a self-supervised learning framework that captures both spatial and temporal variability patterns through a spatio-temporal consistent masking strategy and periodic-driven contrastive learning. This framework can effectively capture the spatio-temporal dynamics of echocardiography and learn representative video features without any labels. We pre-train our model on an extensive dataset comprising over 290,000 echocardiography videos covering 26 scan views across different imaging modes, with up to 20 million frames of images. The pre-trained EchoFM can then be easily adapted and fine-tuned for a variety of downstream tasks, serving as a robust backbone model. Our evaluation was systematically designed for four downstream tasks after the echocardiography examination routine. Experiment results show that EchoFM surpasses state-of-the-art methods, including specialized echocardiography methods, self-supervised pre-training models, and general-purpose pre-trained foundation models, across all downstream tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
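<p class="is-size-7">One plausible reading of the "spatio-temporal consistent masking strategy" is a tube mask that hides the same spatial patches in every frame, so temporal dynamics cannot leak through unmasked positions. The sketch below is an assumption-laden illustration, not the authors' implementation.</p> <pre><code># Sketch of a spatio-temporally consistent ("tube") mask for video pre-training:
# one random spatial mask per clip, replicated across all frames.
import torch

def tube_mask(batch, frames, n_patches, mask_ratio=0.75):
    n_masked = int(n_patches * mask_ratio)
    scores = torch.rand(batch, n_patches)
    masked = scores.argsort(dim=1)[:, :n_masked]              # patches to hide
    mask = torch.zeros(batch, n_patches, dtype=torch.bool)
    mask.scatter_(1, masked, True)
    return mask.unsqueeze(1).expand(batch, frames, n_patches)  # same mask per frame
</code></pre>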
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23200">arXiv:2410.23200</a> <span> [<a href="https://arxiv.org/pdf/2410.23200">pdf</a>, <a href="https://arxiv.org/format/2410.23200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HEX: Hierarchical Emergence Exploitation in Self-Supervised Algorithms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kokilepersaud%2C+K">Kiran Kokilepersaud</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seulgi Kim</a>, <a href="/search/cs?searchtype=author&query=Prabhushankar%2C+M">Mohit Prabhushankar</a>, <a href="/search/cs?searchtype=author&query=AlRegib%2C+G">Ghassan AlRegib</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.23200v1-abstract-full"> In this paper, we propose an algorithm that can be used on top of a wide variety of self-supervised (SSL) approaches to take advantage of hierarchical structures that emerge during training. SSL approaches typically work through some invariance term to ensure consistency between similar samples and a regularization term to prevent global dimensional collapse. Dimensional collapse refers to data representations spanning a lower-dimensional subspace. Recent work has demonstrated that the representation space of these algorithms gradually reflects a semantic hierarchical structure as training progresses. Data samples of the same hierarchical grouping tend to exhibit greater dimensional collapse locally compared to the dataset as a whole, due to sharing features in common with each other. Ideally, SSL algorithms would take advantage of this hierarchical emergence through an additional regularization term that accounts for this local dimensional collapse effect. However, the construction of existing SSL algorithms does not account for this property. To address this, we propose an adaptive algorithm that performs a weighted decomposition of the denominator of the InfoNCE loss into two terms: a local hierarchical regularization term and a global collapse regularization term. This decomposition is based on an adaptive threshold that gradually lowers to reflect the emerging hierarchical structure of the representation space throughout training; the threshold is derived from an analysis of the cosine-similarity distribution of samples in a batch. We demonstrate that this hierarchical emergence exploitation (HEX) approach can be integrated across a wide variety of SSL algorithms. Empirically, we show relative performance improvements of up to 5.6% over baseline SSL approaches in classification accuracy on ImageNet with 100 epochs of training. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> <span class="has-text-grey-dark mathjax">2025 Winter Conference on Applications of Computer Vision (WACV)</span> </p> </li>
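<p class="is-size-7">The weighted denominator decomposition can be sketched directly. The split point, the weighting scheme, and the handling of the positive term below are guesses from the abstract, not the published loss.</p> <pre><code># Hypothetical HEX-style InfoNCE reweighting: negatives whose cosine similarity
# exceeds an adaptive threshold tau are treated as the emerging local hierarchy
# and weighted separately from the remaining (global) negatives.
import torch
import torch.nn.functional as F

def hex_infonce(z1, z2, tau, w_local=0.5, temp=0.1):
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    sims = z1 @ z2.t()                          # (B, B) cosine similarities
    logits = sims / temp
    pos = logits.diag()                         # positive pair logits
    local = sims.ge(tau)                        # within the emerging hierarchy
    exp = logits.exp()
    denom = w_local * (exp * local).sum(1) + (1 - w_local) * (exp * (~local)).sum(1)
    return (-(pos - denom.log())).mean()        # tau would be lowered over training
</code></pre>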
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22918">arXiv:2410.22918</a> <span> [<a href="https://arxiv.org/pdf/2410.22918">pdf</a>, <a href="https://arxiv.org/format/2410.22918">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Simulation-Free Training of Neural ODEs on Paired Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Semin Kim</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+J">Jaehoon Yoo</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinwoo Kim</a>, <a href="/search/cs?searchtype=author&query=Cha%2C+Y">Yeonwoo Cha</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Saehoon Kim</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+S">Seunghoon Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22918v1-abstract-full"> In this work, we investigate a method for simulation-free training of Neural Ordinary Differential Equations (NODEs) for learning deterministic mappings between paired data. Despite the analogy of NODEs as continuous-depth residual networks, their application in typical supervised learning tasks has not been popular, mainly due to the large number of function evaluations required by ODE solvers and numerical instability in gradient estimation. To alleviate this problem, we employ the flow matching framework for simulation-free training of NODEs, which directly regresses the parameterized dynamics function to a predefined target velocity field. Contrary to generative tasks, however, we show that applying flow matching directly between paired data can often lead to an ill-defined flow that breaks the coupling of the data pairs (e.g., due to crossing trajectories). We propose a simple extension that applies flow matching in the embedding space of data pairs, where the embeddings are learned jointly with the dynamics function to ensure the validity of the flow, which is also easier to learn. We demonstrate the effectiveness of our method on both regression and classification tasks, where our method outperforms existing NODEs with significantly fewer function evaluations. The code is available at https://github.com/seminkim/simulation-free-node. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
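<p class="is-size-7">The embedding-space flow-matching objective admits a compact sketch: the dynamics net is regressed onto the straight-line velocity between jointly learned embeddings of each pair. Here <code>enc_x</code>, <code>enc_y</code>, and <code>vfield</code> are hypothetical modules; see the linked repository for the actual code.</p> <pre><code># Sketch of flow matching in a learned embedding space, inferred from the
# abstract: sample a point on the straight interpolant between the pair's
# embeddings and regress the dynamics net onto the constant pair velocity.
import torch
import torch.nn.functional as F

def fm_embedding_loss(enc_x, enc_y, vfield, x, y):
    zx, zy = enc_x(x), enc_y(y)                 # paired embeddings, trained jointly
    t = torch.rand(x.size(0), 1, device=x.device)
    zt = (1 - t) * zx + t * zy                  # point on the straight interpolant
    target_v = zy - zx                          # constant velocity for this pair
    return F.mse_loss(vfield(zt, t), target_v)
</code></pre>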
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22461">arXiv:2410.22461</a> <span> [<a href="https://arxiv.org/pdf/2410.22461">pdf</a>, <a href="https://arxiv.org/format/2410.22461">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unified Domain Generalization and Adaptation for Multi-View 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+G">Gyusam Chang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jiwon Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Donghyun Kim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jinkyu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Dongwook Lee</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Daehyun Ji</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+S">Sujin Jang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sangpil Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22461v1-abstract-full"> Recent advances in 3D object detection leveraging multi-view cameras have demonstrated their practical and economical value in various challenging vision tasks. However, typical supervised learning approaches face challenges in achieving satisfactory adaptation toward unseen and unlabeled target datasets (i.e., direct transfer) due to the inevitable geometric misalignment between the source and target domains. In practice, we also encounter constraints on resources for training models and collecting annotations for the successful deployment of 3D object detectors. In this paper, we propose Unified Domain Generalization and Adaptation (UDGA), a practical solution to mitigate those drawbacks. We first propose a Multi-view Overlap Depth Constraint that leverages the strong association between multiple views, significantly alleviating geometric gaps due to perspective view changes. Then, we present a Label-Efficient Domain Adaptation approach to handle unfamiliar targets with significantly fewer labels (i.e., 1% and 5%), while preserving well-defined source knowledge for training efficiency. Overall, the UDGA framework enables stable detection performance in both source and target domains, effectively bridging inevitable domain gaps, while demanding fewer annotations. We demonstrate the robustness of UDGA with large-scale benchmarks: nuScenes, Lyft, and Waymo, where our framework outperforms the current state-of-the-art methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22376">arXiv:2410.22376</a> <span> [<a href="https://arxiv.org/pdf/2410.22376">pdf</a>, <a href="https://arxiv.org/format/2410.22376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rare-to-Frequent: Unlocking Compositional Generation Power of Diffusion Models on Rare Concepts with LLM Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+D">Dongmin Park</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sebin Kim</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+T">Taehong Moon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+M">Minkyu Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K">Kangwook Lee</a>, <a href="/search/cs?searchtype=author&query=Cho%2C+J">Jaewoong Cho</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22376v1-abstract-full"> State-of-the-art text-to-image (T2I) diffusion models often struggle to generate rare compositions of concepts, e.g., objects with unusual attributes. In this paper, we show that the compositional generation power of diffusion models on such rare concepts can be significantly enhanced by Large Language Model (LLM) guidance. We start with empirical and theoretical analysis, demonstrating that exposing frequent concepts relevant to the target rare concepts during the diffusion sampling process yields more accurate concept composition. Based on this, we propose a training-free approach, R2F, that plans and executes the overall rare-to-frequent concept guidance throughout the diffusion inference by leveraging the abundant semantic knowledge in LLMs. Our framework is flexible across any pre-trained diffusion models and LLMs, and can be seamlessly integrated with region-guided diffusion approaches. In extensive experiments on three datasets, including our newly proposed benchmark RareBench, which contains various prompts with rare compositions of concepts, R2F significantly surpasses existing models, including SD3.0 and FLUX, by up to 28.1%p in T2I alignment. Code is available at https://github.com/krafton-ai/Rare2Frequent. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
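<p class="is-size-7">A minimal sketch of stage-wise rare-to-frequent guidance, assuming a generic reverse-diffusion step function: early, noisy steps condition on an LLM-proposed frequent surrogate concept, later steps on the original rare prompt. The single-swap schedule below is a simplification; R2F itself plans the guidance schedule with an LLM.</p> <pre><code># Hypothetical rare-to-frequent sampling loop (not the official R2F code).
# denoise_step(x, t, emb) is a placeholder for one reverse-diffusion step of
# any pre-trained T2I model conditioned on a prompt embedding.
def r2f_sample(denoise_step, x_T, timesteps, rare_emb, frequent_emb, switch_frac=0.4):
    x = x_T
    n_frequent = int(len(timesteps) * switch_frac)
    for i, t in enumerate(timesteps):                 # descending noise levels
        emb = rare_emb if i &gt;= n_frequent else frequent_emb
        x = denoise_step(x, t, emb)                   # one reverse-diffusion step
    return x
</code></pre>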
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22370">arXiv:2410.22370</a> <span> [<a href="https://arxiv.org/pdf/2410.22370">pdf</a>, <a href="https://arxiv.org/format/2410.22370">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Survey of User Interface Design and Interaction Techniques in Generative AI Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luera%2C+R">Reuben Luera</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&query=Siu%2C+A">Alexa Siu</a>, <a href="/search/cs?searchtype=author&query=Dernoncourt%2C+F">Franck Dernoncourt</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruiyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Salehy%2C+H">Hanieh Salehy</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jian Zhao</a>, <a href="/search/cs?searchtype=author&query=Basu%2C+S">Samyadeep Basu</a>, <a href="/search/cs?searchtype=author&query=Mathur%2C+P">Puneet Mathur</a>, <a href="/search/cs?searchtype=author&query=Lipka%2C+N">Nedim Lipka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22370v1-abstract-full"> The applications of generative AI have become extremely impressive, and the interplay between users and AI is even more so. Current human-AI interaction literature has taken a broad look at how humans interact with generative AI, but it lacks specificity regarding the user interface designs and patterns used to create these applications. Therefore, we present a survey that comprehensively presents taxonomies of how a human interacts with AI and the user interaction patterns designed to meet the needs of a variety of relevant use cases. We focus primarily on user-guided interactions, surveying interactions that are initiated by the user and do not include any implicit signals given by the user. With this survey, we aim to create a compendium of different user-interaction patterns that can be used as a reference for designers and developers alike. In doing so, we also strive to lower the entry barrier for those attempting to learn more about the design of generative AI applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22128">arXiv:2410.22128</a> <span> [<a href="https://arxiv.org/pdf/2410.22128">pdf</a>, <a href="https://arxiv.org/format/2410.22128">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PF3plat: Pose-Free Feed-Forward 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+S">Sunghwan Hong</a>, <a href="/search/cs?searchtype=author&query=Jung%2C+J">Jaewoo Jung</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+H">Heeseong Shin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jisang Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaolong Yang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chong Luo</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungryong Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.22128v1-abstract-full"> We consider the problem of novel view synthesis from unposed images in a single feed-forward pass. Our framework capitalizes on the fast speed, scalability, and high-quality 3D reconstruction and view-synthesis capabilities of 3DGS, which we further extend to offer a practical solution that relaxes common assumptions such as dense image views, accurate camera poses, and substantial image overlaps. We achieve this by identifying and addressing the unique challenges arising from the use of pixel-aligned 3DGS: misaligned 3D Gaussians across different views induce noisy or sparse gradients that destabilize training and hinder convergence, especially when the above assumptions are not met. To mitigate this, we employ pre-trained monocular depth estimation and visual correspondence models to achieve coarse alignments of 3D Gaussians. We then introduce lightweight, learnable modules to refine depth and pose estimates from the coarse alignments, improving the quality of 3D reconstruction and novel view synthesis. Furthermore, the refined estimates are leveraged to estimate geometry confidence scores, which assess the reliability of 3D Gaussian centers and condition the prediction of Gaussian parameters accordingly. Extensive evaluations on large-scale real-world datasets demonstrate that PF3plat sets a new state-of-the-art across all benchmarks, supported by comprehensive ablation studies validating our design choices. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://cvlab-kaist.github.io/PF3plat/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20672">arXiv:2410.20672</a> <span> [<a href="https://arxiv.org/pdf/2410.20672">pdf</a>, <a href="https://arxiv.org/format/2410.20672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Relaxed Recursive Transformers: Effective Parameter Sharing with Layer-wise LoRA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bae%2C+S">Sangmin Bae</a>, <a href="/search/cs?searchtype=author&query=Fisch%2C+A">Adam Fisch</a>, <a href="/search/cs?searchtype=author&query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Z">Ziwei Ji</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Schuster%2C+T">Tal Schuster</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20672v1-abstract-full"> Large language models (LLMs) are expensive to deploy. Parameter sharing offers a possible path towards reducing their size and cost, but its effectiveness in modern LLMs remains fairly limited. In this work, we revisit "layer tying" as a form of parameter sharing in Transformers, and introduce novel methods for converting existing LLMs into smaller "Recursive Transformers" that share parameters across layers, with minimal loss of performance. Here, our Recursive Transformers are efficiently initialized from standard pretrained Transformers, but use only a single block of unique layers that is then repeated multiple times in a loop. We further improve performance by introducing Relaxed Recursive Transformers that add flexibility to the layer tying constraint via depth-wise low-rank adaptation (LoRA) modules, yet still preserve the compactness of the overall model. We show that our recursive models (e.g., recursive Gemma 1B) outperform both similar-sized vanilla pretrained models (such as TinyLlama 1.1B and Pythia 1B) and knowledge distillation baselines, and can even recover most of the performance of the original "full-size" model (e.g., Gemma 2B with no shared parameters). Finally, we propose Continuous Depth-wise Batching, a promising new inference paradigm enabled by the Recursive Transformer when paired with early exiting. In a theoretical analysis, we show that this has the potential to lead to significant (2-3x) gains in inference throughput. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">48 pages, 17 figures, 17 tables</span> </p> </li>
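<p class="is-size-7">The looped shared block with depth-wise LoRA can be sketched in a few lines, assuming a single tied Transformer layer and a low-rank residual correction per loop iteration; the dimensions and initialization are placeholders rather than the paper's configuration.</p> <pre><code># Sketch of a Relaxed Recursive Transformer (hypothetical): one shared layer is
# reused at every depth, with a small depth-specific LoRA adapter relaxing the
# strict layer-tying constraint.
import torch
import torch.nn as nn

class RecursiveLM(nn.Module):
    def __init__(self, dim=512, depth=12, rank=8, heads=8):
        super().__init__()
        self.shared = nn.TransformerEncoderLayer(dim, heads, batch_first=True)
        # depth-wise LoRA: low-rank residual correction per loop iteration
        # (A initialized to zero so each depth starts as exact layer tying)
        self.lora_a = nn.Parameter(torch.zeros(depth, dim, rank))
        self.lora_b = nn.Parameter(torch.randn(depth, rank, dim) * 0.01)
        self.depth = depth

    def forward(self, h):                      # h: (B, T, dim)
        for d in range(self.depth):
            h = self.shared(h)                 # tied weights reused at every depth
            h = h + (h @ self.lora_a[d]) @ self.lora_b[d]   # relaxed tying
        return h
</code></pre>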
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20366">arXiv:2410.20366</a> <span> [<a href="https://arxiv.org/pdf/2410.20366">pdf</a>, <a href="https://arxiv.org/format/2410.20366">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Reconstruction-based Graph-Level Anomaly Detection: Limitations and a Simple Remedy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sunwoo Kim</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S+Y">Soo Yong Lee</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+F">Fanchen Bu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+S">Shinhwan Kang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyungho Kim</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+J">Jaemin Yoo</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+K">Kijung Shin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20366v1-abstract-full"> Graph autoencoders (Graph-AEs) learn representations of given graphs by aiming to accurately reconstruct them. A notable application of Graph-AEs is graph-level anomaly detection (GLAD), whose objective is to identify graphs with anomalous topological structures and/or node features compared to the majority of the graph population. Graph-AEs for GLAD regard a graph with a high mean reconstruction error (i.e., the mean of the errors from all node pairs and/or nodes) as an anomaly. Namely, these methods rest on the assumption that graphs with characteristics similar to the majority are reconstructed better. We, however, report non-trivial counter-examples, a phenomenon we call reconstruction flip, and highlight the limitations of existing Graph-AE-based GLAD methods. Specifically, we empirically and theoretically investigate when this assumption holds and when it fails. Through our analyses, we further argue that, while the reconstruction errors for a given graph are effective features for GLAD, leveraging multifaceted summaries of the reconstruction errors, beyond just the mean, can further strengthen the features. Thus, we propose a novel and simple GLAD method, named MUSE. The key innovation of MUSE is taking multifaceted summaries of reconstruction errors as graph features for GLAD. This surprisingly simple method obtains SOTA performance in GLAD, performing best overall among 14 methods across 10 datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at NeurIPS 2024</span> </p> </li>
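<p class="is-size-7">The core feature extractor is easy to sketch: replace the scalar mean with a vector of summary statistics over a graph's reconstruction errors. The particular statistics chosen below are assumptions for illustration; the paper defines its own set.</p> <pre><code># Sketch of a MUSE-style multifaceted error summary: per-graph reconstruction
# errors are summarized by several statistics, not just the mean, and the
# resulting vector is used as the graph's feature for anomaly scoring.
import torch

def error_summary(errors):          # errors: 1-D tensor of reconstruction errors
    qs = torch.quantile(errors, torch.tensor([0.25, 0.5, 0.75, 0.9]))
    feats = torch.stack([errors.mean(), errors.std(), errors.min(), errors.max()])
    return torch.cat([feats, qs])   # multifaceted summary used as GLAD features
</code></pre>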
arXiv:2410.20011 [pdf, other] cs.CL
A Survey of Small Language Models
Authors: Chien Van Nguyen, Xuan Shen, Ryan Aponte, Yu Xia, Samyadeep Basu, Zhengmian Hu, Jian Chen, Mihir Parmar, Sasidhar Kunapuli, Joe Barrow, Junda Wu, Ashish Singh, Yu Wang, Jiuxiang Gu, Franck Dernoncourt, Nesreen K. Ahmed, Nedim Lipka, Ruiyi Zhang, Xiang Chen, Tong Yu, Sungchul Kim, Hanieh Deilamsalehy, Namyong Park, Mike Rimer, Zhehao Zhang, et al. (3 additional authors not shown)
Abstract: Small Language Models (SLMs) have become increasingly important due to their efficiency and strong performance across various language tasks with minimal computational resources, making them ideal for many settings, including on-device, mobile, and edge deployments. In this article, we present a comprehensive survey of SLMs, focusing on their architectures, training techniques, and model compression techniques. We propose a novel taxonomy for categorizing the methods used to optimize SLMs, including model compression, pruning, and quantization techniques. We summarize the benchmark datasets useful for evaluating SLMs, along with the evaluation metrics commonly used. Additionally, we highlight key open challenges that remain to be addressed. Our survey aims to serve as a valuable resource for researchers and practitioners interested in developing and deploying small yet efficient language models.
Submitted 25 October, 2024; originally announced October 2024.
arXiv:2410.19341 [pdf, other] cs.RO cs.CV
Context-Based Visual-Language Place Recognition
Authors: Soojin Woo, Seong-Woo Kim
Abstract: In vision-based robot localization and SLAM, Visual Place Recognition (VPR) is essential. This paper addresses the problem of VPR, which involves accurately recognizing the location corresponding to a given query image. A popular approach to vision-based place recognition relies on low-level visual features. Despite significant progress in recent years, place recognition based on low-level visual features is challenging when there are changes in scene appearance. To address this, end-to-end training approaches have been proposed to overcome the limitations of hand-crafted features. However, these approaches still fail under drastic changes and require large amounts of labeled data to train models, presenting a significant limitation. Methods that leverage high-level semantic information, such as objects or categories, have been proposed to handle variations in appearance. In this paper, we introduce a novel VPR approach that remains robust to scene changes and does not require additional training. Our method constructs semantic image descriptors by extracting pixel-level embeddings using a zero-shot, language-driven semantic segmentation model. We validate our approach in challenging place recognition scenarios using a real-world public dataset. The experiments demonstrate that our method outperforms non-learned image representation techniques and off-the-shelf convolutional neural network (CNN) descriptors. Our code is available at https://github.com/woo-soojin/context-based-vlpr.
Submitted 25 October, 2024; originally announced October 2024.
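A rough sketch of the training-free pipeline the abstract outlines: per-pixel semantic embeddings (produced by an assumed zero-shot segmentation model, abstracted to an array here) are pooled into a unit-norm descriptor and matched by cosine similarity. The average-pooling choice is ours:

import numpy as np

def place_descriptor(pixel_embeddings):
    # pixel_embeddings: (H*W, D) per-pixel semantic embeddings for one image
    v = pixel_embeddings.mean(axis=0)  # simple average pooling
    return v / np.linalg.norm(v)

def recognize(query_desc, db_descs):
    # db_descs: (N, D) unit-norm descriptors of known places
    sims = db_descs @ query_desc       # cosine similarity (unit vectors)
    return int(np.argmax(sims))        # index of the best-matching place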
arXiv:2410.19022 [pdf, other] cs.LG stat.ML
Heterogeneous Random Forest
Authors: Ye-eun Kim, Seoung Yun Kim, Hyunjoong Kim
Abstract: Random forest (RF) stands out as a highly favored machine learning approach for classification problems. The effectiveness of RF hinges on two key factors: the accuracy of individual trees and the diversity among them. In this study, we introduce a novel approach called heterogeneous RF (HRF), designed to enhance tree diversity in a meaningful way. This diversification is achieved by deliberately introducing heterogeneity during tree construction. Specifically, features used for splitting near the root node of previous trees are assigned lower weights when constructing the feature sub-space of subsequent trees. As a result, dominant features in the prior trees are less likely to be employed in the next iteration, leading to a more diverse set of splitting features at the nodes. Simulation studies confirmed that HRF effectively mitigates the selection bias of trees within the ensemble, increases the diversity of the ensemble, and demonstrates superior performance on datasets with fewer noise features. To assess the comparative performance of HRF against other widely adopted ensemble methods, we conducted tests on 52 datasets, comprising both real-world and synthetic data. HRF consistently outperformed other ensemble methods in terms of accuracy across the majority of datasets.
Submitted 24 October, 2024; originally announced October 2024.
Comments: 18 pages, 6 figures
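The mechanism is concrete enough to sketch: keep a per-feature sampling weight, and after each tree, down-weight the feature that split its root so later trees draw more varied sub-spaces. The decay factor and the root-only penalty are our simplifications of "near the root":

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def fit_hrf(X, y, n_trees=50, subspace=0.6, decay=0.5, seed=0):
    rng = np.random.default_rng(seed)
    n, p = X.shape
    weights = np.ones(p)                  # sampling weight per feature
    k = max(2, int(subspace * p))
    trees = []
    for _ in range(n_trees):
        feats = rng.choice(p, size=k, replace=False, p=weights / weights.sum())
        idx = rng.integers(0, n, n)       # bootstrap sample
        t = DecisionTreeClassifier(random_state=0).fit(X[idx][:, feats], y[idx])
        root = t.tree_.feature[0]         # column split at the root (-2 if leaf)
        if root >= 0:
            weights[feats[root]] *= decay # down-weight the dominant feature
        trees.append((t, feats))
    return trees

def predict_hrf(trees, X):
    # assumes non-negative integer class labels; majority vote across trees
    votes = np.stack([t.predict(X[:, f]) for t, f in trees]).astype(int)
    return np.apply_along_axis(lambda v: np.bincount(v).argmax(), 0, votes)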
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19022v1-abstract-full').style.display = 'none'; document.getElementById('2410.19022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18779">arXiv:2410.18779</a> <span> [<a href="https://arxiv.org/pdf/2410.18779">pdf</a>, <a href="https://arxiv.org/format/2410.18779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Little Help Goes a Long Way: Efficient LLM Training by Leveraging Small LMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rawat%2C+A+S">Ankit Singh Rawat</a>, <a href="/search/cs?searchtype=author&query=Sadhanala%2C+V">Veeranjaneyulu Sadhanala</a>, <a href="/search/cs?searchtype=author&query=Rostamizadeh%2C+A">Afshin Rostamizadeh</a>, <a href="/search/cs?searchtype=author&query=Chakrabarti%2C+A">Ayan Chakrabarti</a>, <a href="/search/cs?searchtype=author&query=Jitkrittum%2C+W">Wittawat Jitkrittum</a>, <a href="/search/cs?searchtype=author&query=Feinberg%2C+V">Vladimir Feinberg</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Seungyeon Kim</a>, <a href="/search/cs?searchtype=author&query=Harutyunyan%2C+H">Hrayr Harutyunyan</a>, <a href="/search/cs?searchtype=author&query=Saunshi%2C+N">Nikunj Saunshi</a>, <a href="/search/cs?searchtype=author&query=Nado%2C+Z">Zachary Nado</a>, <a href="/search/cs?searchtype=author&query=Shivanna%2C+R">Rakesh Shivanna</a>, <a href="/search/cs?searchtype=author&query=Reddi%2C+S+J">Sashank J. Reddi</a>, <a href="/search/cs?searchtype=author&query=Menon%2C+A+K">Aditya Krishna Menon</a>, <a href="/search/cs?searchtype=author&query=Anil%2C+R">Rohan Anil</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+S">Sanjiv Kumar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18779v1-abstract-short" style="display: inline;"> A primary challenge in large language model (LLM) development is their onerous pre-training cost. Typically, such pre-training involves optimizing a self-supervised objective (such as next-token prediction) over a large corpus. This paper explores a promising paradigm to improve LLM pre-training efficiency and quality by suitably leveraging a small language model (SLM). 
Abstract: A primary challenge in large language model (LLM) development is their onerous pre-training cost. Typically, such pre-training involves optimizing a self-supervised objective (such as next-token prediction) over a large corpus. This paper explores a promising paradigm to improve LLM pre-training efficiency and quality by suitably leveraging a small language model (SLM). In particular, this paradigm relies on an SLM to both (1) provide soft labels as additional training supervision, and (2) select a small subset of valuable ("informative" and "hard") training examples. Put together, this enables an effective transfer of the SLM's predictive distribution to the LLM, while prioritizing specific regions of the training data distribution. Empirically, this leads to reduced LLM training time compared to standard training, while improving the overall quality. Theoretically, we develop a statistical framework to systematically study the utility of SLMs in enabling efficient training of high-quality LLMs. In particular, our framework characterizes how the SLM's seemingly low-quality supervision can enhance the training of a much more capable LLM. Furthermore, it also highlights the need for an adaptive utilization of such supervision, by striking a balance between the bias and variance introduced by the SLM-provided soft labels. We corroborate our theoretical framework by improving the pre-training of an LLM with 2.8B parameters by utilizing a smaller LM with 1.5B parameters on the Pile dataset.
Submitted 24 October, 2024; originally announced October 2024.
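The two SLM roles in this paradigm admit a short sketch: blend SLM soft labels into the LLM's loss, and keep only the examples the SLM itself finds hard. The blend weight, temperature, and selection rule below are illustrative, not the paper's settings:

import torch
import torch.nn.functional as F

def soft_label_loss(llm_logits, slm_logits, targets, alpha=0.5, T=1.0):
    # (1) SLM soft labels as additional supervision, mixed with the
    # standard next-token cross-entropy via an illustrative weight alpha
    hard = F.cross_entropy(llm_logits, targets)
    soft = F.kl_div(F.log_softmax(llm_logits / T, dim=-1),
                    F.softmax(slm_logits / T, dim=-1),
                    reduction="batchmean") * T * T
    return (1 - alpha) * hard + alpha * soft

def select_hard_examples(slm_logits, targets, keep_frac=0.25):
    # (2) keep the examples the SLM finds hardest (highest per-example loss)
    losses = F.cross_entropy(slm_logits, targets, reduction="none")
    k = max(1, int(keep_frac * len(losses)))
    return losses.topk(k).indices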
arXiv:2410.18436 [pdf, other] cs.CL
Can Code-Switched Texts Activate a Knowledge Switch in LLMs? A Case Study on English-Korean Code-Switching
Authors: Seoyeon Kim, Huiseo Kim, Chanjun Park, Jinyoung Yeo, Dongha Lee
Abstract: Code-switching (CS), a phenomenon where multilingual speakers alternate between languages in a discourse, can convey subtle cultural and linguistic nuances that can otherwise be lost in translation. Recent state-of-the-art multilingual large language models (LLMs) demonstrate excellent multilingual abilities in various aspects, including understanding CS, but the power of CS in eliciting language-specific knowledge is yet to be discovered. Therefore, we investigate the effectiveness of code-switching on a wide range of multilingual LLMs in terms of knowledge activation, i.e., the act of identifying and leveraging knowledge for reasoning. To facilitate the research, we first present EnKoQA, a synthetic English-Korean CS question-answering dataset. We provide a comprehensive analysis of a variety of multilingual LLMs by subdividing the activation process into knowledge identification and knowledge leveraging. Our experiments demonstrate that, compared to English text, CS can faithfully activate knowledge inside LLMs, especially in language-specific domains. In addition, the performance gap between CS and English is larger in models that show excellent monolingual abilities, suggesting a correlation between CS effectiveness and Korean proficiency.
Submitted 24 October, 2024; originally announced October 2024.
Comments: 19 pages, 6 figures
arXiv:2410.18097 [pdf, other] cs.IR cs.AI cs.LG
RRADistill: Distilling LLMs' Passage Ranking Ability for Long-Tail Queries Document Re-Ranking on a Search Engine
Authors: Nayoung Choi, Youngjune Lee, Gyu-Hwung Cho, Haeyu Jeong, Jungmin Kong, Saehun Kim, Keunchan Park, Sarah Cho, Inchang Jeong, Gyohee Nam, Sunghoon Han, Wonil Yang, Jaeho Choi
Abstract: Large Language Models (LLMs) excel at understanding the semantic relationships between queries and documents, even with lengthy and complex long-tail queries. These queries are challenging for feedback-based rankings due to sparse user engagement and limited feedback, making LLMs' ranking ability highly valuable. However, the large size and slow inference of LLMs necessitate the development of smaller, more efficient models (sLLMs).
Recently, integrating ranking label generation into distillation techniques has become crucial, but existing methods underutilize LLMs' capabilities and are cumbersome. Our work, RRADistill (Re-Ranking Ability Distillation), proposes an efficient label generation pipeline and novel sLLM training methods for both encoder and decoder models. We introduce an encoder-based method using a Term Control Layer to capture term-matching signals and a decoder-based model with a ranking layer for enhanced understanding. A/B testing on a Korean-based search platform validates the effectiveness of our approach in improving re-ranking for long-tail queries.
Submitted 21 November, 2024; v1 submitted 8 October, 2024; originally announced October 2024.
Comments: Accepted to EMNLP 2024 Industry Track. First two authors contributed equally
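Stripped to its general form, ranking-ability distillation turns an LLM's pairwise preferences over documents into training labels for a small ranker. A hedged sketch of such a pairwise objective; this is our illustration of the general idea, not RRADistill's actual pipeline or its Term Control Layer:

import torch
import torch.nn.functional as F

def pairwise_distill_loss(score_pref, score_other, margin=1.0):
    # score_pref / score_other: the small ranker's scores for the document
    # the LLM preferred vs. the one it did not, for the same query
    return F.relu(margin - (score_pref - score_other)).mean()

loss = pairwise_distill_loss(torch.tensor([2.1, 0.3]), torch.tensor([1.0, 0.9]))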
arXiv:2410.18087 [pdf, other] cs.IR cs.AI
CUPID: A Real-Time Session-Based Reciprocal Recommendation System for a One-on-One Social Discovery Platform
Authors: Beomsu Kim, Sangbum Kim, Minchan Kim, Joonyoung Yi, Sungjoo Ha, Suhyun Lee, Youngsoo Lee, Gihun Yeom, Buru Chang, Gihun Lee
Abstract: This study introduces CUPID, a novel approach to session-based reciprocal recommendation systems designed for a real-time one-on-one social discovery platform. In such platforms, low latency is critical to enhance user experiences. However, conventional session-based approaches struggle with high latency due to the demands of modeling sequential user behavior for each recommendation process. Additionally, given the reciprocal nature of the platform, where users act as items for each other, training recommendation models on large-scale datasets is computationally prohibitive using conventional methods. To address these challenges, CUPID decouples the time-intensive user session modeling from the real-time user matching process to reduce inference time. Furthermore, CUPID employs a two-phase training strategy that separates the training of embedding and prediction layers, significantly reducing the computational burden by decreasing the number of sequential model inferences by several hundredfold. Extensive experiments on large-scale Azar datasets demonstrate CUPID's effectiveness in a real-world production environment. Notably, CUPID reduces response latency by more than 76% compared to non-asynchronous systems, while significantly improving user engagement.
Submitted 8 October, 2024; originally announced October 2024.
Comments: The 2nd International Workshop on User Understanding from Big Data Workshop (DMU2 2024)
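The decoupling is the transferable idea: run the expensive session encoder asynchronously and cache its embeddings, so the request path only does a cheap dot-product. A minimal sketch with hypothetical class and method names:

import numpy as np

class AsyncSessionCache:
    def __init__(self, encoder):
        self.encoder = encoder   # expensive sequential session model
        self.cache = {}          # user_id -> precomputed embedding

    def refresh(self, user_id, session_events):
        # runs in the background, NOT on the request path
        self.cache[user_id] = self.encoder(session_events)

    def match(self, user_id, candidate_ids):
        # low-latency scoring: one dot-product per candidate
        u = self.cache[user_id]
        c = np.stack([self.cache[cid] for cid in candidate_ids])
        return candidate_ids[int(np.argmax(c @ u))]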
arXiv:2410.18001 [pdf, other] cs.AI
Benchmarking Foundation Models on Exceptional Cases: Dataset Creation and Validation
Authors: Suho Kang, Jungyang Park, Joonseo Ha, SoMin Kim, JinHyeong Kim, Subeen Park, Kyungwoo Song
Abstract: Foundation models (FMs) have achieved significant success across various tasks, leading to research on benchmarks for reasoning abilities. However, there is a lack of studies on FMs' performance in exceptional scenarios, which we define as out-of-distribution (OOD) reasoning tasks. This paper is the first to address these cases, developing a novel dataset for evaluating FMs across multiple modalities, including graphic novels, calligraphy, news articles, and lyrics. It includes tasks for instance classification, character recognition, token prediction, and text generation. The paper also proposes prompt engineering techniques like Chain-of-Thought (CoT) and CoT+Few-Shot to enhance performance. Validation of FMs using various methods revealed improvements. The code repository is accessible at https://github.com/MLAI-Yonsei/ExceptionalBenchmark.
Submitted 23 October, 2024; originally announced October 2024.
Comments: EMNLP 2024 Workshop Genbench (https://genbench.org/workshop_programme/)
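For illustration, a CoT+Few-Shot prompt of the general kind the abstract mentions; the wording and the exemplar are ours, not drawn from the dataset:

FEW_SHOT = (
    "Q: Which panel shows the character speaking?\n"
    "A: The speech bubble's tail points to panel 2, so the answer is panel 2.\n"
)

def cot_prompt(question):
    # few-shot exemplar followed by a chain-of-thought trigger
    return FEW_SHOT + f"Q: {question}\nA: Let's think step by step."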
arXiv:2410.17712 [pdf, other] cs.AI
A Data-Driven Odyssey in Solar Vehicles
Authors: Do Young Kim, Kyunghyun Kim, Gyeongseop Lee, Niloy Das, Seong-Woo Kim
Abstract: Solar vehicles, which simultaneously produce and consume energy, require meticulous energy management. However, potential users often feel uncertain about their operation compared to conventional vehicles. This study presents a simulator designed to help users understand long-distance travel in solar vehicles and recognize the importance of proper energy management. By utilizing Google Maps data and weather information, the simulator replicates real-world driving conditions and provides a dashboard displaying vehicle status, updated hourly based on user-inputted speed. Users can explore various speed policy scenarios and receive recommendations for optimal driving strategies. The simulator's effectiveness was validated using the route of the World Solar Challenge (WSC).
This research enables users to monitor energy dynamics before a journey, enhancing their understanding of energy management and informing appropriate speed decisions.
Submitted 23 October, 2024; originally announced October 2024.
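The simulator's core is an hourly energy-balance loop; a toy version with made-up constants (the real system draws solar input from weather data and consumption from route data):

def simulate(hours, speed_kmh, battery_kwh=5.0, capacity_kwh=5.0):
    log = []
    for h in range(hours):
        solar_in = 1.0 if 8 <= h % 24 <= 17 else 0.0   # kWh gained this hour
        drive_out = 0.002 * speed_kmh ** 2 / 10        # crude drag-style cost
        battery_kwh = min(capacity_kwh,
                          max(0.0, battery_kwh + solar_in - drive_out))
        log.append((h, round(battery_kwh, 2)))         # dashboard-style state
    return log

print(simulate(24, speed_kmh=70)[-1])  # battery state after a day of driving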
arXiv:2410.17578 [pdf, other] cs.CL
MM-Eval: A Multilingual Meta-Evaluation Benchmark for LLM-as-a-Judge and Reward Models
Authors: Guijin Son, Dongkeun Yoon, Juyoung Suk, Javier Aula-Blasco, Mano Aslan, Vu Trong Kim, Shayekh Bin Islam, Jaume Prats-Cristià, Lucía Tormo-Bañuelos, Seungone Kim
Abstract: Large language models (LLMs) are commonly used as evaluators in tasks (e.g., reward modeling, LLM-as-a-judge), where they act as proxies for human preferences or judgments. This leads to the need for meta-evaluation: evaluating the credibility of LLMs as evaluators. However, existing benchmarks primarily focus on English, offering limited insight into LLMs' effectiveness as evaluators in non-English contexts. To address this, we introduce MM-Eval, a multilingual meta-evaluation benchmark that covers 18 languages across six categories. MM-Eval evaluates various dimensions, including language-specific challenges like linguistics and language hallucinations. Evaluation results show that both proprietary and open-source language models have considerable room for improvement. Further analysis reveals a tendency for these models to assign middle-ground scores to low-resource languages. We publicly release our benchmark and code.
Submitted 23 October, 2024; originally announced October 2024.
Comments: work in progress
href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>